From 7ce069c8731e4d98b8fa903d50fb9b7ae5ba636a Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Sat, 10 Apr 2021 14:08:02 +0700 Subject: [PATCH 01/13] :rocket: init refactoring --- setup.py | 2 +- tensorflow_asr/metrics/__init__.py | 0 tensorflow_asr/metrics/error_rates.py | 33 ++++ tensorflow_asr/models/__init__.py | 75 --------- tensorflow_asr/models/activations/__init__.py | 0 .../{activations.py => activations/glu.py} | 0 tensorflow_asr/models/base_model.py | 75 +++++++++ tensorflow_asr/models/conformer.py | 6 +- tensorflow_asr/models/contextnet.py | 20 ++- tensorflow_asr/models/ctc.py | 55 +++++- tensorflow_asr/models/deepspeech2.py | 8 +- tensorflow_asr/models/jasper.py | 4 +- tensorflow_asr/models/streaming_transducer.py | 10 +- tensorflow_asr/models/transducer.py | 103 ++++++++---- tensorflow_asr/utils/__init__.py | 74 --------- tensorflow_asr/utils/env_util.py | 77 +++++++++ tensorflow_asr/utils/feature_util.py | 27 +++ tensorflow_asr/utils/file_util.py | 57 +++++++ tensorflow_asr/utils/layer_util.py | 29 ++++ .../utils/{utils.py => math_util.py} | 156 ++---------------- .../utils/{metrics.py => metric_util.py} | 43 ++--- tensorflow_asr/utils/shape_util.py | 32 ++++ 22 files changed, 507 insertions(+), 379 deletions(-) create mode 100644 tensorflow_asr/metrics/__init__.py create mode 100644 tensorflow_asr/metrics/error_rates.py create mode 100644 tensorflow_asr/models/activations/__init__.py rename tensorflow_asr/models/{activations.py => activations/glu.py} (100%) mode change 100755 => 100644 create mode 100644 tensorflow_asr/models/base_model.py create mode 100644 tensorflow_asr/utils/env_util.py create mode 100644 tensorflow_asr/utils/feature_util.py create mode 100644 tensorflow_asr/utils/file_util.py create mode 100644 tensorflow_asr/utils/layer_util.py rename tensorflow_asr/utils/{utils.py => math_util.py} (53%) mode change 100755 => 100644 rename tensorflow_asr/utils/{metrics.py => metric_util.py} (67%) create mode 100644 
tensorflow_asr/utils/shape_util.py diff --git a/setup.py b/setup.py index a2c415d29e..717cbcddfd 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ setuptools.setup( name="TensorFlowASR", - version="0.8.3", + version="1.0.0", author="Huy Le Nguyen", author_email="nlhuy.cs.16@gmail.com", description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2", diff --git a/tensorflow_asr/metrics/__init__.py b/tensorflow_asr/metrics/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorflow_asr/metrics/error_rates.py b/tensorflow_asr/metrics/error_rates.py new file mode 100644 index 0000000000..143e199109 --- /dev/null +++ b/tensorflow_asr/metrics/error_rates.py @@ -0,0 +1,33 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tensorflow as tf + + +class ErrorRate(tf.keras.metrics.Metric): + """ Metric for WER or CER """ + + def __init__(self, func, name="error_rate", **kwargs): + super(ErrorRate, self).__init__(name=name, **kwargs) + self.numerator = self.add_weight(name=f"{name}_numerator", initializer="zeros") + self.denominator = self.add_weight(name=f"{name}_denominator", initializer="zeros") + self.func = func + + def update_state(self, decode: tf.Tensor, target: tf.Tensor): + n, d = self.func(decode, target) + self.numerator.assign_add(n) + self.denominator.assign_add(d) + + def result(self): + return tf.math.divide_no_nan(self.numerator, self.denominator) * 100 diff --git a/tensorflow_asr/models/__init__.py b/tensorflow_asr/models/__init__.py index 7f37b4ffb1..e69de29bb2 100644 --- a/tensorflow_asr/models/__init__.py +++ b/tensorflow_asr/models/__init__.py @@ -1,75 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import abc -import tempfile -import tensorflow as tf - -from ..utils.utils import is_cloud_path, is_hdf5_filepath - - -class Model(tf.keras.Model): - def __init__(self, name, **kwargs): - super(Model, self).__init__(name=name, **kwargs) - - def save(self, filepath, overwrite=True, include_optimizer=True, save_format=None, - signatures=None, options=None, save_traces=True): - if is_cloud_path(filepath) and is_hdf5_filepath(filepath): - _, ext = os.path.splitext(filepath) - with tempfile.NamedTemporaryFile(suffix=ext) as tmp: - super(Model, self).save( - tmp.name, overwrite=overwrite, include_optimizer=include_optimizer, - save_format=save_format, signatures=signatures, options=options, save_traces=save_traces - ) - tf.io.gfile.copy(tmp.name, filepath, overwrite=True) - else: - super(Model, self).save( - filepath, overwrite=overwrite, include_optimizer=include_optimizer, - save_format=save_format, signatures=signatures, options=options, save_traces=save_traces - ) - - def save_weights(self, filepath, overwrite=True, save_format=None, options=None): - if is_cloud_path(filepath) and is_hdf5_filepath(filepath): - _, ext = os.path.splitext(filepath) - with tempfile.NamedTemporaryFile(suffix=ext) as tmp: - super(Model, self).save_weights(tmp.name, overwrite=overwrite, save_format=save_format, options=options) - tf.io.gfile.copy(tmp.name, filepath, overwrite=True) - else: - super(Model, self).save_weights(filepath, overwrite=overwrite, save_format=save_format, options=options) - - def load_weights(self, filepath, by_name=False, skip_mismatch=False, options=None): - if is_cloud_path(filepath) and is_hdf5_filepath(filepath): - _, ext = os.path.splitext(filepath) - with tempfile.NamedTemporaryFile(suffix=ext) as tmp: - tf.io.gfile.copy(filepath, tmp.name, overwrite=True) - super(Model, self).load_weights(tmp.name, by_name=by_name, skip_mismatch=skip_mismatch, options=options) - else: - super(Model, self).load_weights(filepath, by_name=by_name, 
skip_mismatch=skip_mismatch, options=options) - - @abc.abstractmethod - def _build(self, *args, **kwargs): - raise NotImplementedError() - - @abc.abstractmethod - def call(self, inputs, training=False, **kwargs): - raise NotImplementedError() - - @abc.abstractmethod - def recognize(self, features, input_lengths, **kwargs): - pass - - @abc.abstractmethod - def recognize_beam(self, features, input_lengths, **kwargs): - pass diff --git a/tensorflow_asr/models/activations/__init__.py b/tensorflow_asr/models/activations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorflow_asr/models/activations.py b/tensorflow_asr/models/activations/glu.py old mode 100755 new mode 100644 similarity index 100% rename from tensorflow_asr/models/activations.py rename to tensorflow_asr/models/activations/glu.py diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py new file mode 100644 index 0000000000..c545577abc --- /dev/null +++ b/tensorflow_asr/models/base_model.py @@ -0,0 +1,75 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import abc +import tempfile +import tensorflow as tf + +from ..utils import file_util + + +class Model(tf.keras.Model): + def __init__(self, name, **kwargs): + super(Model, self).__init__(name=name, **kwargs) + + def save(self, filepath, overwrite=True, include_optimizer=True, save_format=None, + signatures=None, options=None, save_traces=True): + if file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath): + _, ext = os.path.splitext(filepath) + with tempfile.NamedTemporaryFile(suffix=ext) as tmp: + super(Model, self).save( + tmp.name, overwrite=overwrite, include_optimizer=include_optimizer, + save_format=save_format, signatures=signatures, options=options, save_traces=save_traces + ) + tf.io.gfile.copy(tmp.name, filepath, overwrite=True) + else: + super(Model, self).save( + filepath, overwrite=overwrite, include_optimizer=include_optimizer, + save_format=save_format, signatures=signatures, options=options, save_traces=save_traces + ) + + def save_weights(self, filepath, overwrite=True, save_format=None, options=None): + if file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath): + _, ext = os.path.splitext(filepath) + with tempfile.NamedTemporaryFile(suffix=ext) as tmp: + super(Model, self).save_weights(tmp.name, overwrite=overwrite, save_format=save_format, options=options) + tf.io.gfile.copy(tmp.name, filepath, overwrite=True) + else: + super(Model, self).save_weights(filepath, overwrite=overwrite, save_format=save_format, options=options) + + def load_weights(self, filepath, by_name=False, skip_mismatch=False, options=None): + if file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath): + _, ext = os.path.splitext(filepath) + with tempfile.NamedTemporaryFile(suffix=ext) as tmp: + tf.io.gfile.copy(filepath, tmp.name, overwrite=True) + super(Model, self).load_weights(tmp.name, by_name=by_name, skip_mismatch=skip_mismatch, options=options) + else: + super(Model, self).load_weights(filepath, 
by_name=by_name, skip_mismatch=skip_mismatch, options=options) + + @abc.abstractmethod + def _build(self, *args, **kwargs): + raise NotImplementedError() + + @abc.abstractmethod + def call(self, inputs, training=False, **kwargs): + raise NotImplementedError() + + @abc.abstractmethod + def recognize(self, features, input_lengths, **kwargs): + pass + + @abc.abstractmethod + def recognize_beam(self, features, input_lengths, **kwargs): + pass diff --git a/tensorflow_asr/models/conformer.py b/tensorflow_asr/models/conformer.py index 0fa3585ce4..a13dfa1d19 100755 --- a/tensorflow_asr/models/conformer.py +++ b/tensorflow_asr/models/conformer.py @@ -14,12 +14,12 @@ import tensorflow as tf -from .activations import GLU +from .activations.glu import GLU from .transducer import Transducer from .layers.subsampling import VggSubsampling, Conv2dSubsampling from .layers.positional_encoding import PositionalEncoding, PositionalEncodingConcat from .layers.multihead_attention import MultiHeadAttention, RelPositionMultiHeadAttention -from ..utils.utils import shape_list +from ..utils import shape_util L2 = tf.keras.regularizers.l2(1e-6) @@ -179,7 +179,7 @@ def __init__(self, def call(self, inputs, training=False, **kwargs): outputs = self.ln(inputs, training=training) - B, T, E = shape_list(outputs) + B, T, E = shape_util.shape_list(outputs) outputs = tf.reshape(outputs, [B, T, 1, E]) outputs = self.pw_conv_1(outputs, training=training) outputs = self.glu(outputs) diff --git a/tensorflow_asr/models/contextnet.py b/tensorflow_asr/models/contextnet.py index 8bc4e12857..636560101d 100644 --- a/tensorflow_asr/models/contextnet.py +++ b/tensorflow_asr/models/contextnet.py @@ -16,7 +16,7 @@ from typing import List import tensorflow as tf from .transducer import Transducer -from ..utils.utils import merge_two_last_dims, get_reduced_length +from ..utils import math_util L2 = tf.keras.regularizers.l2(1e-6) @@ -30,7 +30,7 @@ def get_activation(activation: str = "silu"): class 
Reshape(tf.keras.layers.Layer): - def call(self, inputs): return merge_two_last_dims(inputs) + def call(self, inputs): return math_util.merge_two_last_dims(inputs) class ConvModule(tf.keras.layers.Layer): @@ -154,7 +154,7 @@ def call(self, inputs, training=False, **kwargs): for conv in self.convs: outputs = conv(outputs, training=training) outputs = self.last_conv(outputs, training=training) - input_length = get_reduced_length(input_length, self.last_conv.strides) + input_length = math_util.get_reduced_length(input_length, self.last_conv.strides) outputs = self.se([outputs, input_length], training=training) if self.residual is not None: res = self.residual(features, training=training) @@ -282,8 +282,11 @@ def recognize(self, tf.Tensor: a batch of decoded transcripts """ encoded = self.encoder([features, input_length], training=False) - return self._perform_greedy_batch(encoded, input_length, - parallel_iterations=parallel_iterations, swap_memory=swap_memory) + return self._perform_greedy_batch( + encoded, input_length, + parallel_iterations=parallel_iterations, + swap_memory=swap_memory + ) def recognize_tflite(self, signal, predicted, prediction_states): """ @@ -347,5 +350,8 @@ def recognize_beam(self, tf.Tensor: a batch of decoded transcripts """ encoded = self.encoder([features, input_length], training=False) - return self._perform_beam_search_batch(encoded, input_length, lm, - parallel_iterations=parallel_iterations, swap_memory=swap_memory) + return self._perform_beam_search_batch( + encoded, input_length, lm, + parallel_iterations=parallel_iterations, + swap_memory=swap_memory + ) diff --git a/tensorflow_asr/models/ctc.py b/tensorflow_asr/models/ctc.py index 0e12c52c79..a95949544b 100644 --- a/tensorflow_asr/models/ctc.py +++ b/tensorflow_asr/models/ctc.py @@ -15,11 +15,13 @@ from typing import Optional import numpy as np import tensorflow as tf +from tensorflow.keras import mixed_precision as mxp from . 
import Model from ..featurizers.speech_featurizers import TFSpeechFeaturizer from ..featurizers.text_featurizers import TextFeaturizer -from ..utils.utils import shape_list, get_reduced_length +from ..utils import math_util, shape_util +from ..losses.keras.ctc_losses import CtcLoss class CtcModel(Model): @@ -31,6 +33,49 @@ def _build(self, input_shape, batch_size=None): features = tf.keras.Input(input_shape, batch_size=batch_size, dtype=tf.float32) self(features, training=False) + @property + def metrics(self): + return [self.loss_metric] + + def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, **kwargs): + loss = CtcLoss(blank=blank, global_batch_size=global_batch_size) + self.use_loss_scale = use_loss_scale + if self.use_loss_scale: + optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic") + self.loss_metric = tf.keras.metrics.Mean(name="ctc_loss", dtype=tf.float32) + super(CtcModel, self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs) + + def train_step(self, batch): + x, y_true = batch + with tf.GradientTape() as tape: + logit = self(x["input"], training=True) + y_pred = { + "logit": logit, + "logit_length": math_util.get_reduced_length(x["input_length"], self.time_reduction_factor) + } + loss = self.loss(y_true, y_pred) + if self.use_loss_scale: + scaled_loss = self.optimizer.get_scaled_loss(loss) + if self.use_loss_scale: + scaled_gradients = tape.gradient(scaled_loss, self.trainable_weights) + gradients = self.optimizer.get_unscaled_gradients(scaled_gradients) + else: + gradients = tape.gradient(loss, self.trainable_weights) + self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) + self.loss_metric.update_state(loss) + return {m.name: m.result() for m in self.metrics} + + def test_step(self, batch): + x, y_true = batch + logit = self(x["input"], training=False) + y_pred = { + "logit": logit, + "logit_length": 
math_util.get_reduced_length(x["input_length"], self.time_reduction_factor) + } + loss = self.loss(y_true, y_pred) + self.loss_metric.update_state(loss) + return {m.name: m.result() for m in self.metrics} + def add_featurizers(self, speech_featurizer: TFSpeechFeaturizer, text_featurizer: TextFeaturizer): @@ -67,8 +112,8 @@ def recognize_tflite(self, signal): """ features = self.speech_featurizer.tf_extract(signal) features = tf.expand_dims(features, axis=0) - input_length = shape_list(features)[1] - input_length = get_reduced_length(input_length, self.time_reduction_factor) + input_length = shape_util.shape_list(features)[1] + input_length = math_util.get_reduced_length(input_length, self.time_reduction_factor) input_length = tf.expand_dims(input_length, axis=0) logits = self(features, training=False) probs = tf.nn.softmax(logits) @@ -113,8 +158,8 @@ def recognize_beam_tflite(self, signal): """ features = self.speech_featurizer.tf_extract(signal) features = tf.expand_dims(features, axis=0) - input_length = shape_list(features)[1] - input_length = get_reduced_length(input_length, self.time_reduction_factor) + input_length = shape_util.shape_list(features)[1] + input_length = math_util.get_reduced_length(input_length, self.time_reduction_factor) input_length = tf.expand_dims(input_length, axis=0) logits = self(features, training=False) probs = tf.nn.softmax(logits) diff --git a/tensorflow_asr/models/deepspeech2.py b/tensorflow_asr/models/deepspeech2.py index 6bc99fe1f9..1e855c5ef3 100644 --- a/tensorflow_asr/models/deepspeech2.py +++ b/tensorflow_asr/models/deepspeech2.py @@ -14,14 +14,14 @@ import tensorflow as tf -from ..utils.utils import get_rnn, get_conv, merge_two_last_dims +from ..utils import layer_util, math_util from .layers.row_conv_1d import RowConv1D from .layers.sequence_wise_bn import SequenceBatchNorm from .ctc import CtcModel class Reshape(tf.keras.layers.Layer): - def call(self, inputs): return merge_two_last_dims(inputs) + def call(self, inputs): 
return math_util.merge_two_last_dims(inputs) class ConvBlock(tf.keras.layers.Layer): @@ -34,7 +34,7 @@ def __init__(self, **kwargs): super(ConvBlock, self).__init__(**kwargs) - CNN = get_conv(conv_type) + CNN = layer_util.get_conv(conv_type) self.conv = CNN(filters=filters, kernel_size=kernels, strides=strides, padding="same", dtype=tf.float32, name=f"{self.name}_{conv_type}") @@ -118,7 +118,7 @@ def __init__(self, **kwargs): super(RnnBlock, self).__init__(**kwargs) - RNN = get_rnn(rnn_type) + RNN = layer_util.get_rnn(rnn_type) self.rnn = RNN(units, dropout=dropout, return_sequences=True, use_bias=True, name=f"{self.name}_{rnn_type}") if bidirectional: diff --git a/tensorflow_asr/models/jasper.py b/tensorflow_asr/models/jasper.py index 70709da644..a8b0780403 100644 --- a/tensorflow_asr/models/jasper.py +++ b/tensorflow_asr/models/jasper.py @@ -14,12 +14,12 @@ import tensorflow as tf -from ..utils.utils import merge_two_last_dims +from ..utils import math_util from .ctc import CtcModel class Reshape(tf.keras.layers.Layer): - def call(self, inputs): return merge_two_last_dims(inputs) + def call(self, inputs): return math_util.merge_two_last_dims(inputs) class JasperSubBlock(tf.keras.layers.Layer): diff --git a/tensorflow_asr/models/streaming_transducer.py b/tensorflow_asr/models/streaming_transducer.py index 266db0e13e..ba793126e2 100644 --- a/tensorflow_asr/models/streaming_transducer.py +++ b/tensorflow_asr/models/streaming_transducer.py @@ -17,11 +17,11 @@ from .layers.subsampling import TimeReduction from .transducer import Transducer -from ..utils.utils import get_rnn, merge_two_last_dims, shape_list +from ..utils import layer_util, math_util, shape_util class Reshape(tf.keras.layers.Layer): - def call(self, inputs): return merge_two_last_dims(inputs) + def call(self, inputs): return math_util.merge_two_last_dims(inputs) class StreamingTransducerBlock(tf.keras.Model): @@ -41,7 +41,7 @@ def __init__(self, else: self.reduction = None - RNN = get_rnn(rnn_type) + 
RNN = layer_util.get_rnn(rnn_type) self.rnn = RNN( units=rnn_units, return_sequences=True, name=f"{self.name}_{rnn_type}", return_state=True, @@ -269,7 +269,7 @@ def recognize(self, Returns: tf.Tensor: a batch of decoded transcripts """ - batch_size, _, _, _ = shape_list(features) + batch_size, _, _, _ = shape_util.shape_list(features) encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size)) return self._perform_greedy_batch(encoded, input_length, parallel_iterations=parallel_iterations, swap_memory=swap_memory) @@ -336,7 +336,7 @@ def recognize_beam(self, Returns: tf.Tensor: a batch of decoded transcripts """ - batch_size, _, _, _ = shape_list(features) + batch_size, _, _, _ = shape_util.shape_list(features) encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size)) return self._perform_beam_search_batch(encoded, input_length, lm, parallel_iterations=parallel_iterations, swap_memory=swap_memory) diff --git a/tensorflow_asr/models/transducer.py b/tensorflow_asr/models/transducer.py index 6195e2ee7d..efd3c4d55e 100755 --- a/tensorflow_asr/models/transducer.py +++ b/tensorflow_asr/models/transducer.py @@ -15,12 +15,14 @@ import collections import tensorflow as tf +from tensorflow.keras import mixed_precision as mxp from . 
import Model -from ..utils.utils import get_rnn, shape_list, count_non_blank, pad_prediction_tfarray +from ..utils import math_util, layer_util, shape_util from ..featurizers.speech_featurizers import SpeechFeaturizer from ..featurizers.text_featurizers import TextFeaturizer from .layers.embedding import Embedding +from ..losses.keras.rnnt_losses import RnntLoss Hypothesis = collections.namedtuple("Hypothesis", ("index", "prediction", "states")) @@ -47,7 +49,7 @@ def __init__(self, regularizer=kernel_regularizer, name=f"{name}_embedding") self.do = tf.keras.layers.Dropout(embed_dropout, name=f"{name}_dropout") # Initialize rnn layers - RNN = get_rnn(rnn_type) + RNN = layer_util.get_rnn(rnn_type) self.rnns = [] for i in range(num_rnns): rnn = RNN( @@ -302,12 +304,21 @@ def __init__(self, ) self.time_reduction_factor = 1 + @property + def metrics(self): + return [self.loss_metric] + def _build(self, input_shape, prediction_shape=[None], batch_size=None): inputs = tf.keras.Input(shape=input_shape, batch_size=batch_size, dtype=tf.float32) input_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32) pred = tf.keras.Input(shape=prediction_shape, batch_size=batch_size, dtype=tf.int32) pred_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32) - self([inputs, input_length, pred, pred_length], training=False) + self({ + "input": inputs, + "input_length": input_length, + "prediction": pred, + "prediction_length": pred_length + }, training=False) def summary(self, line_length=None, **kwargs): if self.encoder is not None: self.encoder.summary(line_length=line_length, **kwargs) @@ -328,25 +339,25 @@ def add_featurizers(self, self.speech_featurizer = speech_featurizer self.text_featurizer = text_featurizer - def call(self, inputs, training=False, **kwargs): - """ - Transducer Model call function - Args: - features: audio features in shape [B, T, F, C] - input_length: features time length in shape [B] - prediction: predicted sequence of ids, 
in shape [B, U] - prediction_length: predicted sequence of ids length in shape [B] - training: python boolean - **kwargs: sth else + def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, **kwargs): + loss = RnntLoss(blank=blank, global_batch_size=global_batch_size) + self.use_loss_scale = use_loss_scale + if self.use_loss_scale: + optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic") + self.loss_metric = tf.keras.metrics.Mean(name="rnnt_loss", dtype=tf.float32) + super(Transducer, self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs) - Returns: - `logits` with shape [B, T, U, vocab] - """ - features, _, prediction, prediction_length = inputs + def call(self, inputs, training=False, **kwargs): + features = inputs["input"] + prediction = inputs["prediction"] + prediction_length = inputs["prediction_length"] enc = self.encoder(features, training=training, **kwargs) pred = self.predict_net([prediction, prediction_length], training=training, **kwargs) outputs = self.joint_net([enc, pred], training=training, **kwargs) - return outputs + return { + "logit": outputs, + "logit_length": math_util.get_reduced_length(inputs["input_length"], self.time_reduction_factor) + } # -------------------------------- INFERENCES------------------------------------- @@ -485,7 +496,7 @@ def body(batch, decoded): parallel_iterations=parallel_iterations, swap_memory=True, ) - decoded = pad_prediction_tfarray(decoded, blank=self.text_featurizer.blank) + decoded = math_util.pad_prediction_tfarray(decoded, blank=self.text_featurizer.blank) return self.text_featurizer.iextract(decoded.stack()) def _perform_greedy(self, @@ -641,7 +652,7 @@ def body(batch, decoded): parallel_iterations=parallel_iterations, swap_memory=True, ) - decoded = pad_prediction_tfarray(decoded, blank=self.text_featurizer.blank) + decoded = math_util.pad_prediction_tfarray(decoded, blank=self.text_featurizer.blank) 
return self.text_featurizer.iextract(decoded.stack()) def _perform_beam_search(self, @@ -661,20 +672,32 @@ def _perform_beam_search(self, def initialize_beam(dynamic=False): return BeamHypothesis( score=tf.TensorArray( - dtype=tf.float32, size=beam_width if not dynamic else 0, dynamic_size=dynamic, - element_shape=tf.TensorShape([]), clear_after_read=False + dtype=tf.float32, + size=beam_width if not dynamic else 0, + dynamic_size=dynamic, + element_shape=tf.TensorShape([]), + clear_after_read=False ), indices=tf.TensorArray( - dtype=tf.int32, size=beam_width if not dynamic else 0, dynamic_size=dynamic, - element_shape=tf.TensorShape([]), clear_after_read=False + dtype=tf.int32, + size=beam_width if not dynamic else 0, + dynamic_size=dynamic, + element_shape=tf.TensorShape([]), + clear_after_read=False ), prediction=tf.TensorArray( - dtype=tf.int32, size=beam_width if not dynamic else 0, dynamic_size=dynamic, - element_shape=None, clear_after_read=False + dtype=tf.int32, + size=beam_width if not dynamic else 0, + dynamic_size=dynamic, + element_shape=None, + clear_after_read=False ), states=tf.TensorArray( - dtype=tf.float32, size=beam_width if not dynamic else 0, dynamic_size=dynamic, - element_shape=tf.TensorShape(shape_list(self.predict_net.get_initial_state())), clear_after_read=False + dtype=tf.float32, + size=beam_width if not dynamic else 0, + dynamic_size=dynamic, + element_shape=tf.TensorShape(shape_util.shape_list(self.predict_net.get_initial_state())), + clear_after_read=False ), ) @@ -694,7 +717,11 @@ def body(time, total, B): score=A.score.unstack(B.score.stack()), indices=A.indices.unstack(B.indices.stack()), prediction=A.prediction.unstack( - pad_prediction_tfarray(B.prediction, blank=self.text_featurizer.blank).stack()), + math_util.pad_prediction_tfarray( + B.prediction, + blank=self.text_featurizer.blank + ).stack() + ), states=A.states.unstack(B.states.stack()), ) A_i = tf.constant(0, tf.int32) @@ -710,7 +737,9 @@ def beam_body(beam, beam_width, 
A, A_i, B): y_hat_score = y_hat_score[0] y_hat_index = tf.gather_nd(A.indices.stack(), y_hat_score_index) y_hat_prediction = tf.gather_nd( - pad_prediction_tfarray(A.prediction, blank=self.text_featurizer.blank).stack(), y_hat_score_index) + math_util.pad_prediction_tfarray(A.prediction, blank=self.text_featurizer.blank).stack(), + y_hat_score_index + ) y_hat_states = tf.gather_nd(A.states.stack(), y_hat_score_index) # remove y_hat from A @@ -720,8 +749,12 @@ def beam_body(beam, beam_width, A, A_i, B): A = BeamHypothesis( score=A.score.unstack(tf.gather_nd(A.score.stack(), remain_indices)), indices=A.indices.unstack(tf.gather_nd(A.indices.stack(), remain_indices)), - prediction=A.prediction.unstack(tf.gather_nd( - pad_prediction_tfarray(A.prediction, blank=self.text_featurizer.blank).stack(), remain_indices)), + prediction=A.prediction.unstack( + tf.gather_nd( + math_util.pad_prediction_tfarray(A.prediction, blank=self.text_featurizer.blank).stack(), + remain_indices + ) + ), states=A.states.unstack(tf.gather_nd(A.states.stack(), remain_indices)), ) A_i = tf.cond(tf.equal(A_i, 0), true_fn=lambda: A_i, false_fn=lambda: A_i - 1) @@ -747,7 +780,7 @@ def true_fn(): ) def false_fn(): - scatter_index = count_non_blank(y_hat_prediction, blank=self.text_featurizer.blank) + scatter_index = math_util.count_non_blank(y_hat_prediction, blank=self.text_featurizer.blank) updated_prediction = tf.tensor_scatter_nd_update( y_hat_prediction, indices=tf.reshape(scatter_index, [1, 1]), @@ -797,9 +830,9 @@ def false_fn(): ) scores = B.score.stack() - prediction = pad_prediction_tfarray(B.prediction, blank=self.text_featurizer.blank).stack() + prediction = math_util.pad_prediction_tfarray(B.prediction, blank=self.text_featurizer.blank).stack() if self.text_featurizer.decoder_config.norm_score: - prediction_lengths = count_non_blank(prediction, blank=self.text_featurizer.blank, axis=1) + prediction_lengths = math_util.count_non_blank(prediction, blank=self.text_featurizer.blank, axis=1) 
scores /= tf.cast(prediction_lengths, dtype=scores.dtype) y_hat_score, y_hat_score_index = tf.math.top_k(scores, k=1) diff --git a/tensorflow_asr/utils/__init__.py b/tensorflow_asr/utils/__init__.py index e7becd8f27..e69de29bb2 100644 --- a/tensorflow_asr/utils/__init__.py +++ b/tensorflow_asr/utils/__init__.py @@ -1,74 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def setup_environment(): # Set memory growth and only log ERRORs - """ Setting tensorflow running environment """ - import warnings - - warnings.simplefilter("ignore") - - import tensorflow as tf - - tf.get_logger().setLevel("ERROR") - - tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True}) - - -def setup_devices(devices, cpu=False): - """Setting visible devices - - Args: - devices (list): list of visible devices' indices - """ - import tensorflow as tf - - if cpu: - cpus = tf.config.list_physical_devices("CPU") - tf.config.set_visible_devices(cpus, "CPU") - else: - gpus = tf.config.list_physical_devices("GPU") - if gpus: - visible_gpus = [gpus[i] for i in devices] - tf.config.set_visible_devices(visible_gpus, "GPU") - print("Run on", len(visible_gpus), "Physical GPUs") - - -def setup_strategy(devices): - """Setting mirrored strategy for training - - Args: - devices (list): list of visible devices' indices - - Returns: - tf.distribute.Strategy: MirroredStrategy for training one or multiple gpus - """ - import tensorflow as tf - 
- setup_devices(devices) - - return tf.distribute.MirroredStrategy() - - -def setup_tpu(tpu_address=None): - import tensorflow as tf - - if tpu_address is None: - resolver = tf.distribute.cluster_resolver.TPUClusterResolver() - else: - resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + tpu_address) - tf.config.experimental_connect_to_cluster(resolver) - tf.tpu.experimental.initialize_tpu_system(resolver) - print("All TPUs: ", tf.config.list_logical_devices('TPU')) - return tf.distribute.experimental.TPUStrategy(resolver) diff --git a/tensorflow_asr/utils/env_util.py b/tensorflow_asr/utils/env_util.py new file mode 100644 index 0000000000..2bf4970415 --- /dev/null +++ b/tensorflow_asr/utils/env_util.py @@ -0,0 +1,77 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tensorflow as tf + + +def setup_environment(): # Set memory growth and only log ERRORs + """ Setting tensorflow running environment """ + import warnings + warnings.simplefilter("ignore") + tf.get_logger().setLevel("ERROR") + tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True}) + + +def setup_devices(devices, cpu=False): + """Setting visible devices + + Args: + devices (list): list of visible devices' indices + """ + if cpu: + cpus = tf.config.list_physical_devices("CPU") + tf.config.set_visible_devices(cpus, "CPU") + else: + gpus = tf.config.list_physical_devices("GPU") + if gpus: + visible_gpus = [gpus[i] for i in devices] + tf.config.set_visible_devices(visible_gpus, "GPU") + print("Run on", len(visible_gpus), "Physical GPUs") + + +def setup_strategy(devices): + """Setting mirrored strategy for training + + Args: + devices (list): list of visible devices' indices + + Returns: + tf.distribute.Strategy: MirroredStrategy for training one or multiple gpus + """ + setup_devices(devices) + return tf.distribute.MirroredStrategy() + + +def setup_tpu(tpu_address=None): + if tpu_address is None: + resolver = tf.distribute.cluster_resolver.TPUClusterResolver() + else: + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + tpu_address) + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + print("All TPUs: ", tf.config.list_logical_devices('TPU')) + return tf.distribute.experimental.TPUStrategy(resolver) + + +def has_gpu_or_tpu(): + gpus = tf.config.list_logical_devices("GPU") + tpus = tf.config.list_logical_devices("TPU") + if len(gpus) == 0 and len(tpus) == 0: return False + return True + + +def has_tpu(): + tpus = tf.config.list_logical_devices("TPU") + if len(tpus) == 0: return False + return True diff --git a/tensorflow_asr/utils/feature_util.py b/tensorflow_asr/utils/feature_util.py new file mode 100644 index 0000000000..0d8a294ce1 --- /dev/null +++ 
b/tensorflow_asr/utils/feature_util.py @@ -0,0 +1,27 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tensorflow as tf + + +def float_feature(list_of_floats): + return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_floats)) + + +def int64_feature(list_of_ints): + return tf.train.Feature(int64_list=tf.train.Int64List(value=list_of_ints)) + + +def bytestring_feature(list_of_bytestrings): + return tf.train.Feature(bytes_list=tf.train.BytesList(value=list_of_bytestrings)) diff --git a/tensorflow_asr/utils/file_util.py b/tensorflow_asr/utils/file_util.py new file mode 100644 index 0000000000..c9d1c867d0 --- /dev/null +++ b/tensorflow_asr/utils/file_util.py @@ -0,0 +1,57 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import re +from typing import Union, List +import tensorflow as tf + + +def is_hdf5_filepath(filepath): + return (filepath.endswith('.h5') or filepath.endswith('.keras') or filepath.endswith('.hdf5')) + + +def is_cloud_path(path): + """ Check if the path is on cloud (which requires tf.io.gfile) + + Args: + path (str): Path to directory or file + + Returns: + bool: True if path is on cloud, False otherwise + """ + return bool(re.match(r"^[a-z]+://", path)) + + +def preprocess_paths(paths: Union[List, str]): + """Expand the path to the root "/" + + Args: + paths (Union[List, str]): A path or list of paths + + Returns: + Union[List, str]: A processed path or list of paths, return None if it's not path + """ + if isinstance(paths, list): + return [path if is_cloud_path(path) else os.path.abspath(os.path.expanduser(path)) for path in paths] + elif isinstance(paths, str): + return paths if is_cloud_path(paths) else os.path.abspath(os.path.expanduser(paths)) + else: + return None + + +def read_bytes(path: str) -> tf.Tensor: + with tf.io.gfile.GFile(path, "rb") as f: + content = f.read() + return tf.convert_to_tensor(content, dtype=tf.string) diff --git a/tensorflow_asr/utils/layer_util.py b/tensorflow_asr/utils/layer_util.py new file mode 100644 index 0000000000..6e2647f581 --- /dev/null +++ b/tensorflow_asr/utils/layer_util.py @@ -0,0 +1,29 @@ + +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tensorflow as tf + + +def get_rnn(rnn_type: str): + assert rnn_type in ["lstm", "gru", "rnn"] + if rnn_type == "lstm": return tf.keras.layers.LSTM + if rnn_type == "gru": return tf.keras.layers.GRU + return tf.keras.layers.SimpleRNN + + +def get_conv(conv_type): + assert conv_type in ["conv1d", "conv2d"] + if conv_type == "conv1d": return tf.keras.layers.Conv1D + return tf.keras.layers.Conv2D diff --git a/tensorflow_asr/utils/utils.py b/tensorflow_asr/utils/math_util.py old mode 100755 new mode 100644 similarity index 53% rename from tensorflow_asr/utils/utils.py rename to tensorflow_asr/utils/math_util.py index fef55a0cf6..451a9bcb03 --- a/tensorflow_asr/utils/utils.py +++ b/tensorflow_asr/utils/math_util.py @@ -12,74 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re -import os -import sys import math -from typing import Union, List - import numpy as np import tensorflow as tf +from . import shape_util -def float_feature(list_of_floats): - return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_floats)) - - -def int64_feature(list_of_ints): - return tf.train.Feature(int64_list=tf.train.Int64List(value=list_of_ints)) - - -def bytestring_feature(list_of_bytestrings): - return tf.train.Feature(bytes_list=tf.train.BytesList(value=list_of_bytestrings)) - - -def append_default_keys_dict(default_dict, dest_dict): - if not dest_dict: - return default_dict - for key in default_dict.keys(): - if key not in dest_dict.keys(): - dest_dict[key] = default_dict[key] - return dest_dict - - -def check_key_in_dict(dictionary, keys): - for key in keys: - if key not in dictionary.keys(): - raise ValueError("{} must be defined".format(key)) +def log10(x): + numerator = tf.math.log(x) + denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype)) + return numerator / denominator -def is_hdf5_filepath(filepath): - return (filepath.endswith('.h5') or filepath.endswith('.keras') or 
filepath.endswith('.hdf5')) - - -def is_cloud_path(path): - """ Check if the path is on cloud (which requires tf.io.gfile) - - Args: - path (str): Path to directory or file - - Returns: - bool: True if path is on cloud, False otherwise - """ - return bool(re.match(r"^[a-z]+://", path)) - - -def preprocess_paths(paths: Union[List, str]): - """Expand the path to the root "/" - - Args: - paths (Union[List, str]): A path or list of paths - Returns: - Union[List, str]: A processed path or list of paths, return None if it's not path - """ - if isinstance(paths, list): - return [path if is_cloud_path(path) else os.path.abspath(os.path.expanduser(path)) for path in paths] - elif isinstance(paths, str): - return paths if is_cloud_path(paths) else os.path.abspath(os.path.expanduser(paths)) - else: - return None +def get_num_batches(samples, batch_size, drop_remainders=True): + if samples is None or batch_size is None: return None + if drop_remainders: return math.floor(float(samples) / float(batch_size)) + return math.ceil(float(samples) / float(batch_size)) def nan_to_zero(input_tensor): @@ -91,65 +40,23 @@ def bytes_to_string(array: np.ndarray, encoding: str = "utf-8"): return [transcript.decode(encoding) for transcript in array] -def get_num_batches(samples, batch_size, drop_remainders=True): - if samples is None or batch_size is None: return None - if drop_remainders: return math.floor(float(samples) / float(batch_size)) - return math.ceil(float(samples) / float(batch_size)) - - -def merge_two_last_dims(x): - b, _, f, c = shape_list(x) - return tf.reshape(x, shape=[b, -1, f * c]) - - -def get_rnn(rnn_type: str): - assert rnn_type in ["lstm", "gru", "rnn"] - if rnn_type.lower() == "lstm": return tf.keras.layers.LSTM - if rnn_type.lower() == "gru": return tf.keras.layers.GRU - return tf.keras.layers.SimpleRNN - - -def get_conv(conv_type): - assert conv_type in ["conv1d", "conv2d"] - - if conv_type == "conv1d": - return tf.keras.layers.Conv1D - - return 
tf.keras.layers.Conv2D - - -def print_one_line(*args): - tf.print("\033[K", end="") - tf.print("\r", *args, sep="", end=" ", output_stream=sys.stdout) - - -def read_bytes(path: str) -> tf.Tensor: - with tf.io.gfile.GFile(path, "rb") as f: - content = f.read() - return tf.convert_to_tensor(content, dtype=tf.string) - - -def shape_list(x, out_type=tf.int32): - """Deal with dynamic shape in tensorflow cleanly.""" - static = x.shape.as_list() - dynamic = tf.shape(x, out_type=out_type) - return [dynamic[i] if s is None else s for i, s in enumerate(static)] +def get_reduced_length(length, reduction_factor): + return tf.cast(tf.math.ceil(tf.divide(length, tf.cast(reduction_factor, dtype=length.dtype))), dtype=tf.int32) -def get_shape_invariants(tensor): - shapes = shape_list(tensor) - return tf.TensorShape([i if isinstance(i, int) else None for i in shapes]) +def count_non_blank(tensor: tf.Tensor, blank: int or tf.Tensor = 0, axis=None): + return tf.reduce_sum(tf.where(tf.not_equal(tensor, blank), x=tf.ones_like(tensor), y=tf.zeros_like(tensor)), axis=axis) -def get_float_spec(tensor): - shape = get_shape_invariants(tensor) - return tf.TensorSpec(shape, dtype=tf.float32) +def merge_two_last_dims(x): + b, _, f, c = shape_util.shape_list(x) + return tf.reshape(x, shape=[b, -1, f * c]) def merge_repeated(yseqs, blank=0): result = tf.reshape(yseqs[0], [1]) - U = shape_list(yseqs)[0] + U = shape_util.shape_list(yseqs)[0] i = tf.constant(1, dtype=tf.int32) def _cond(i, result, yseqs, U): return tf.less(i, U) @@ -171,34 +78,7 @@ def _body(i, result, yseqs, U): ) ) - return tf.pad(result, [[U - shape_list(result)[0], 0]], constant_values=blank) - - -def log10(x): - numerator = tf.math.log(x) - denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype)) - return numerator / denominator - - -def get_reduced_length(length, reduction_factor): - return tf.cast(tf.math.ceil(tf.divide(length, tf.cast(reduction_factor, dtype=length.dtype))), dtype=tf.int32) - - -def 
count_non_blank(tensor: tf.Tensor, blank: int or tf.Tensor = 0, axis=None): - return tf.reduce_sum(tf.where(tf.not_equal(tensor, blank), x=tf.ones_like(tensor), y=tf.zeros_like(tensor)), axis=axis) - - -def has_gpu_or_tpu(): - gpus = tf.config.list_logical_devices("GPU") - tpus = tf.config.list_logical_devices("TPU") - if len(gpus) == 0 and len(tpus) == 0: return False - return True - - -def has_tpu(): - tpus = tf.config.list_logical_devices("TPU") - if len(tpus) == 0: return False - return True + return tf.pad(result, [[U - shape_util.shape_list(result)[0], 0]], constant_values=blank) def find_max_length_prediction_tfarray(tfarray: tf.TensorArray) -> tf.Tensor: diff --git a/tensorflow_asr/utils/metrics.py b/tensorflow_asr/utils/metric_util.py similarity index 67% rename from tensorflow_asr/utils/metrics.py rename to tensorflow_asr/utils/metric_util.py index efb59ed452..c26dcc451f 100644 --- a/tensorflow_asr/utils/metrics.py +++ b/tensorflow_asr/utils/metric_util.py @@ -13,14 +13,15 @@ # limitations under the License. from typing import Tuple -import tensorflow as tf from nltk.metrics import distance -from .utils import bytes_to_string +import tensorflow as tf +from . 
import math_util -def _wer(decode, target): - decode = bytes_to_string(decode) - target = bytes_to_string(target) + +def execute_wer(decode, target): + decode = math_util.bytes_to_string(decode) + target = math_util.bytes_to_string(target) dis = 0.0 length = 0.0 for dec, tar in zip(decode, target): @@ -35,7 +36,7 @@ def _wer(decode, target): return tf.convert_to_tensor(dis, tf.float32), tf.convert_to_tensor(length, tf.float32) -def wer(_decode: tf.Tensor, _target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: +def wer(decode: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: """Word Error Rate Args: @@ -45,12 +46,12 @@ def wer(_decode: tf.Tensor, _target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: Returns: tuple: a tuple of tf.Tensor of (edit distances, number of words) of each text """ - return tf.numpy_function(_wer, inp=[_decode, _target], Tout=[tf.float32, tf.float32]) + return tf.numpy_function(execute_wer, inp=[decode, target], Tout=[tf.float32, tf.float32]) -def _cer(decode, target): - decode = bytes_to_string(decode) - target = bytes_to_string(target) +def execute_cer(decode, target): + decode = math_util.bytes_to_string(decode) + target = math_util.bytes_to_string(target) dis = 0 length = 0 for dec, tar in zip(decode, target): @@ -59,7 +60,7 @@ def _cer(decode, target): return tf.convert_to_tensor(dis, tf.float32), tf.convert_to_tensor(length, tf.float32) -def cer(_decode: tf.Tensor, _target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: +def cer(decode: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: """Character Error Rate Args: @@ -69,7 +70,7 @@ def cer(_decode: tf.Tensor, _target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: Returns: tuple: a tuple of tf.Tensor of (edit distances, number of characters) of each text """ - return tf.numpy_function(_cer, inp=[_decode, _target], Tout=[tf.float32, tf.float32]) + return tf.numpy_function(execute_cer, inp=[decode, target], Tout=[tf.float32, tf.float32]) def tf_cer(decode: tf.Tensor, target: 
tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: @@ -87,21 +88,3 @@ def tf_cer(decode: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: distances = tf.edit_distance(decode.to_sparse(), target.to_sparse(), normalize=False) # [B] lengths = tf.cast(target.row_lengths(axis=1), dtype=tf.float32) # [B] return tf.reduce_sum(distances), tf.reduce_sum(lengths) - - -class ErrorRate(tf.keras.metrics.Metric): - """ Metric for WER and CER """ - - def __init__(self, func, name="error_rate", **kwargs): - super(ErrorRate, self).__init__(name=name, **kwargs) - self.numerator = self.add_weight(name=f"{name}_numerator", initializer="zeros") - self.denominator = self.add_weight(name=f"{name}_denominator", initializer="zeros") - self.func = func - - def update_state(self, decode: tf.Tensor, target: tf.Tensor): - n, d = self.func(decode, target) - self.numerator.assign_add(n) - self.denominator.assign_add(d) - - def result(self): - return tf.math.divide_no_nan(self.numerator, self.denominator) * 100 diff --git a/tensorflow_asr/utils/shape_util.py b/tensorflow_asr/utils/shape_util.py new file mode 100644 index 0000000000..d482621f0c --- /dev/null +++ b/tensorflow_asr/utils/shape_util.py @@ -0,0 +1,32 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tensorflow as tf + + +def shape_list(x, out_type=tf.int32): + """Deal with dynamic shape in tensorflow cleanly.""" + static = x.shape.as_list() + dynamic = tf.shape(x, out_type=out_type) + return [dynamic[i] if s is None else s for i, s in enumerate(static)] + + +def get_shape_invariants(tensor): + shapes = shape_list(tensor) + return tf.TensorShape([i if isinstance(i, int) else None for i in shapes]) + + +def get_float_spec(tensor): + shape = get_shape_invariants(tensor) + return tf.TensorSpec(shape, dtype=tf.float32) From 32970d767fa5e637733ce09af4a955a4d285dac0 Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Sat, 10 Apr 2021 17:23:01 +0700 Subject: [PATCH 02/13] :rocket: refactor models --- tensorflow_asr/losses/__init__.py | 17 --- .../{keras/ctc_losses.py => ctc_loss.py} | 30 +++- tensorflow_asr/losses/ctc_losses.py | 26 ---- tensorflow_asr/losses/keras/__init__.py | 17 --- tensorflow_asr/losses/keras/rnnt_losses.py | 31 ---- .../losses/{rnnt_losses.py => rnnt_loss.py} | 31 +++- tensorflow_asr/models/base_model.py | 144 ++++++++++++------ tensorflow_asr/models/ctc/__init__.py | 0 tensorflow_asr/models/{ => ctc}/ctc.py | 100 ++++++------ .../models/{ => ctc}/deepspeech2.py | 81 +++++++--- tensorflow_asr/models/{ => ctc}/jasper.py | 96 +++++++++--- tensorflow_asr/models/transducer/__init__.py | 0 .../models/{ => transducer}/conformer.py | 10 +- .../models/{ => transducer}/contextnet.py | 2 +- .../rnn_transducer.py} | 22 +-- .../models/{ => transducer}/transducer.py | 83 +++++----- tensorflow_asr/optimizers/schedules.py | 8 +- tensorflow_asr/utils/data_util.py | 43 ++++++ tensorflow_asr/utils/file_util.py | 21 +++ 19 files changed, 447 insertions(+), 315 deletions(-) rename tensorflow_asr/losses/{keras/ctc_losses.py => ctc_loss.py} (58%) delete mode 100644 tensorflow_asr/losses/ctc_losses.py delete mode 100644 tensorflow_asr/losses/keras/__init__.py delete mode 100644 tensorflow_asr/losses/keras/rnnt_losses.py rename 
tensorflow_asr/losses/{rnnt_losses.py => rnnt_loss.py} (93%) create mode 100644 tensorflow_asr/models/ctc/__init__.py rename tensorflow_asr/models/{ => ctc}/ctc.py (69%) rename tensorflow_asr/models/{ => ctc}/deepspeech2.py (84%) rename tensorflow_asr/models/{ => ctc}/jasper.py (74%) create mode 100644 tensorflow_asr/models/transducer/__init__.py rename tensorflow_asr/models/{ => transducer}/conformer.py (98%) mode change 100755 => 100644 rename tensorflow_asr/models/{ => transducer}/contextnet.py (99%) rename tensorflow_asr/models/{streaming_transducer.py => transducer/rnn_transducer.py} (96%) rename tensorflow_asr/models/{ => transducer}/transducer.py (94%) mode change 100755 => 100644 create mode 100644 tensorflow_asr/utils/data_util.py diff --git a/tensorflow_asr/losses/__init__.py b/tensorflow_asr/losses/__init__.py index f9ae63d25d..e69de29bb2 100644 --- a/tensorflow_asr/losses/__init__.py +++ b/tensorflow_asr/losses/__init__.py @@ -1,17 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from .ctc_losses import ctc_loss -from .rnnt_losses import rnnt_loss -__all__ = ['ctc_loss', 'rnnt_loss'] diff --git a/tensorflow_asr/losses/keras/ctc_losses.py b/tensorflow_asr/losses/ctc_loss.py similarity index 58% rename from tensorflow_asr/losses/keras/ctc_losses.py rename to tensorflow_asr/losses/ctc_loss.py index 9b46fa6670..6808c57b15 100644 --- a/tensorflow_asr/losses/keras/ctc_losses.py +++ b/tensorflow_asr/losses/ctc_loss.py @@ -11,9 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import tensorflow as tf -from .. import ctc_loss class CtcLoss(tf.keras.losses.Loss): @@ -23,9 +21,27 @@ def __init__(self, blank=0, global_batch_size=None, name=None): self.global_batch_size = global_batch_size def call(self, y_true, y_pred): - logits = y_pred["logit"] - logit_length = y_pred["logit_length"] - labels = y_true["label"] - label_length = y_true["label_length"] - loss = ctc_loss(labels, logits, logit_length, label_length, blank=self.blank) + logits, logits_length = y_pred.values() + labels, labels_length = y_true.values() + loss = ctc_loss( + y_pred=logits, + input_length=logits_length, + y_true=labels, + label_length=labels_length, + blank=self.blank, + name=self.name + ) return tf.nn.compute_average_loss(loss, global_batch_size=self.global_batch_size) + + +@tf.function +def ctc_loss(y_true, y_pred, input_length, label_length, blank, name=None): + return tf.nn.ctc_loss( + labels=tf.cast(y_true, tf.int32), + logit_length=tf.cast(input_length, tf.int32), + logits=tf.cast(y_pred, tf.float32), + label_length=tf.cast(label_length, tf.int32), + logits_time_major=False, + blank_index=blank, + name=name + ) diff --git a/tensorflow_asr/losses/ctc_losses.py b/tensorflow_asr/losses/ctc_losses.py deleted file mode 100644 index d2eccb0ad9..0000000000 --- a/tensorflow_asr/losses/ctc_losses.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 
2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import tensorflow as tf - - -@tf.function -def ctc_loss(y_true, y_pred, input_length, label_length, blank): - return tf.nn.ctc_loss( - labels=tf.cast(y_true, tf.int32), - logit_length=tf.cast(input_length, tf.int32), - logits=tf.cast(y_pred, tf.float32), - label_length=tf.cast(label_length, tf.int32), - logits_time_major=False, - blank_index=blank - ) diff --git a/tensorflow_asr/losses/keras/__init__.py b/tensorflow_asr/losses/keras/__init__.py deleted file mode 100644 index 4b667418c3..0000000000 --- a/tensorflow_asr/losses/keras/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from .rnnt_losses import RnntLoss -from .ctc_losses import CtcLoss -__all__ = ['RnntLoss', 'CtcLoss'] diff --git a/tensorflow_asr/losses/keras/rnnt_losses.py b/tensorflow_asr/losses/keras/rnnt_losses.py deleted file mode 100644 index 14e0915e55..0000000000 --- a/tensorflow_asr/losses/keras/rnnt_losses.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tensorflow as tf -from .. import rnnt_loss - - -class RnntLoss(tf.keras.losses.Loss): - def __init__(self, blank=0, global_batch_size=None, name=None): - super(RnntLoss, self).__init__(reduction=tf.keras.losses.Reduction.NONE, name=name) - self.blank = blank - self.global_batch_size = global_batch_size - - def call(self, y_true, y_pred): - logits = y_pred["logit"] - logit_length = y_pred["logit_length"] - labels = y_true["label"] - label_length = y_true["label_length"] - loss = rnnt_loss(logits, labels, label_length, logit_length, blank=self.blank) - return tf.nn.compute_average_loss(loss, global_batch_size=self.global_batch_size) diff --git a/tensorflow_asr/losses/rnnt_losses.py b/tensorflow_asr/losses/rnnt_loss.py similarity index 93% rename from tensorflow_asr/losses/rnnt_losses.py rename to tensorflow_asr/losses/rnnt_loss.py index e8a2486a6e..646ec4586f 100644 --- a/tensorflow_asr/losses/rnnt_losses.py +++ b/tensorflow_asr/losses/rnnt_loss.py @@ -15,9 +15,11 @@ import tensorflow as tf from tensorflow.python.ops.gen_array_ops import 
matrix_diag_part_v2 -from ..utils.utils import has_gpu_or_tpu +from ..utils import env_util -use_cpu = not has_gpu_or_tpu() +use_cpu = not env_util.has_gpu_or_tpu() + +LOG_0 = float("-inf") try: from warprnnt_tensorflow import rnnt_loss as warp_rnnt_loss @@ -28,6 +30,27 @@ use_warprnnt = False +class RnntLoss(tf.keras.losses.Loss): + def __init__(self, blank=0, global_batch_size=None, name=None): + super(RnntLoss, self).__init__(reduction=tf.keras.losses.Reduction.NONE, name=name) + self.blank = blank + self.global_batch_size = global_batch_size + + def call(self, y_true, y_pred): + logits, logits_length = y_pred.values() + labels, labels_length = y_true.values() + loss = rnnt_loss( + logits=logits, + logit_length=logits_length, + labels=labels, + label_length=labels_length, + blank=self.blank, + name=self.name + ) + return tf.nn.compute_average_loss(loss, global_batch_size=self.global_batch_size) + + +@tf.function def rnnt_loss(logits, labels, label_length, logit_length, blank=0, name=None): if use_warprnnt: return rnnt_loss_warprnnt(logits=logits, labels=labels, @@ -36,7 +59,6 @@ def rnnt_loss(logits, labels, label_length, logit_length, blank=0, name=None): return rnnt_loss_tf(logits=logits, labels=labels, label_length=label_length, logit_length=logit_length, name=name) -@tf.function def rnnt_loss_warprnnt(logits, labels, label_length, logit_length, blank=0): if not tf.config.list_physical_devices('GPU'): logits = tf.nn.log_softmax(logits) @@ -50,9 +72,6 @@ def rnnt_loss_warprnnt(logits, labels, label_length, logit_length, blank=0): return loss -LOG_0 = float("-inf") - - def nan_to_zero(input_tensor): return tf.where(tf.math.is_nan(input_tensor), tf.zeros_like(input_tensor), input_tensor) diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py index c545577abc..b8378410e2 100644 --- a/tensorflow_asr/models/base_model.py +++ b/tensorflow_asr/models/base_model.py @@ -12,64 +12,112 @@ # See the License for the specific language 
governing permissions and # limitations under the License. -import os -import abc -import tempfile import tensorflow as tf +from tensorflow.keras import mixed_precision as mxp -from ..utils import file_util - - -class Model(tf.keras.Model): - def __init__(self, name, **kwargs): - super(Model, self).__init__(name=name, **kwargs) - - def save(self, filepath, overwrite=True, include_optimizer=True, save_format=None, - signatures=None, options=None, save_traces=True): - if file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath): - _, ext = os.path.splitext(filepath) - with tempfile.NamedTemporaryFile(suffix=ext) as tmp: - super(Model, self).save( - tmp.name, overwrite=overwrite, include_optimizer=include_optimizer, - save_format=save_format, signatures=signatures, options=options, save_traces=save_traces - ) - tf.io.gfile.copy(tmp.name, filepath, overwrite=True) - else: - super(Model, self).save( - filepath, overwrite=overwrite, include_optimizer=include_optimizer, - save_format=save_format, signatures=signatures, options=options, save_traces=save_traces +from ..utils import file_util, env_util + + +class BaseModel(tf.keras.Model): + def save(self, + filepath, + overwrite=True, + include_optimizer=True, + save_format=None, + signatures=None, + options=None, + save_traces=True): + with file_util.save_file(filepath) as path: + super().save( + filepath=path, + overwrite=overwrite, + include_optimizer=include_optimizer, + save_format=save_format, + signatures=signatures, + options=options, + save_traces=save_traces + ) + + def save_weights(self, + filepath, + overwrite=True, + save_format=None, + options=None): + with file_util.save_file(filepath) as path: + super().save_weights( + filepath=path, + overwrite=overwrite, + save_format=save_format, + options=options ) - def save_weights(self, filepath, overwrite=True, save_format=None, options=None): - if file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath): - _, ext = 
os.path.splitext(filepath) - with tempfile.NamedTemporaryFile(suffix=ext) as tmp: - super(Model, self).save_weights(tmp.name, overwrite=overwrite, save_format=save_format, options=options) - tf.io.gfile.copy(tmp.name, filepath, overwrite=True) - else: - super(Model, self).save_weights(filepath, overwrite=overwrite, save_format=save_format, options=options) - - def load_weights(self, filepath, by_name=False, skip_mismatch=False, options=None): - if file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath): - _, ext = os.path.splitext(filepath) - with tempfile.NamedTemporaryFile(suffix=ext) as tmp: - tf.io.gfile.copy(filepath, tmp.name, overwrite=True) - super(Model, self).load_weights(tmp.name, by_name=by_name, skip_mismatch=skip_mismatch, options=options) - else: - super(Model, self).load_weights(filepath, by_name=by_name, skip_mismatch=skip_mismatch, options=options) - - @abc.abstractmethod + def load_weights(self, + filepath, + by_name=False, + skip_mismatch=False, + options=None): + with file_util.read_file(filepath) as path: + super().load_weights( + filepath=path, + by_name=by_name, + skip_mismatch=skip_mismatch, + options=options + ) + + @property + def metrics(self): + return [self.loss_metric] + def _build(self, *args, **kwargs): raise NotImplementedError() - @abc.abstractmethod - def call(self, inputs, training=False, **kwargs): - raise NotImplementedError() + def compile(self, loss, optimizer, run_eagerly=None, **kwargs): + self.use_loss_scale = False + if not env_util.has_tpu(): + optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic") + self.use_loss_scale = True + self.loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32) + super().compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs) + + # -------------------------------- STEP FUNCTIONS ------------------------------------- + + def train_step(self, batch): + inputs, y_true = batch + with tf.GradientTape() as 
tape:
+            y_pred = self(inputs, training=True)
+            loss = self.loss(y_true, y_pred)
+            # Scale a copy for the gradient only: `loss` must stay unscaled for the metric below.
+            scaled_loss = self.optimizer.get_scaled_loss(loss) if self.use_loss_scale else loss
+        gradients = tape.gradient(scaled_loss, self.trainable_variables)
+        if self.use_loss_scale:
+            gradients = self.optimizer.get_unscaled_gradients(gradients)
+        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+        self.loss_metric.update_state(loss)
+        return {m.name: m.result() for m in self.metrics}
+
+    def test_step(self, batch):
+        inputs, y_true = batch
+        y_pred = self(inputs, training=False)
+        loss = self.loss(y_true, y_pred)
+        self.loss_metric.update_state(loss)
+        return {m.name: m.result() for m in self.metrics}
+
+    def predict_step(self, batch):
+        """
+        Args:
+            batch ([tf.Tensor]): a batch of testing data
+
+        Returns:
+            [tf.Tensor]: stacked tensor of shape [B, 3] with each row is the text [truth, greedy, beam_search]
+        """
+        inputs, y_true = batch
+        labels = self.text_featurizer.iextract(y_true)
+        greedy_decoding = self.recognize(inputs)
+        beam_search_decoding = self.recognize_beam(inputs)
+        return tf.stack([labels, greedy_decoding, beam_search_decoding], axis=-1)
 
-    @abc.abstractmethod
     def recognize(self, features, input_lengths, **kwargs):
         pass
 
-    @abc.abstractmethod
     def recognize_beam(self, features, input_lengths, **kwargs):
         pass
diff --git a/tensorflow_asr/models/ctc/__init__.py b/tensorflow_asr/models/ctc/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow_asr/models/ctc.py b/tensorflow_asr/models/ctc/ctc.py
similarity index 69%
rename from tensorflow_asr/models/ctc.py
rename to tensorflow_asr/models/ctc/ctc.py
index a95949544b..ab0b60da16 100644
--- a/tensorflow_asr/models/ctc.py
+++ b/tensorflow_asr/models/ctc/ctc.py
@@ -12,69 +12,55 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional +from typing import Optional, Union import numpy as np import tensorflow as tf -from tensorflow.keras import mixed_precision as mxp -from . import Model -from ..featurizers.speech_featurizers import TFSpeechFeaturizer -from ..featurizers.text_featurizers import TextFeaturizer -from ..utils import math_util, shape_util -from ..losses.keras.ctc_losses import CtcLoss - - -class CtcModel(Model): - def __init__(self, **kwargs): - super(CtcModel, self).__init__(**kwargs) +from ..base_model import BaseModel +from ...featurizers.speech_featurizers import TFSpeechFeaturizer +from ...featurizers.text_featurizers import TextFeaturizer +from ...utils import math_util, shape_util, data_util +from ...losses.ctc_loss import CtcLoss + + +class CtcModel(BaseModel): + def __init__(self, + encoder: tf.keras.Model, + decoder: Union[tf.keras.Model, tf.keras.layers.Layer] = None, + vocabulary_size: int = None, + **kwargs): + super().__init__(**kwargs) + self.encoder = encoder + if decoder is None: + assert vocabulary_size is not None, "vocabulary_size must be set" + self.decoder = tf.keras.layers.Dense(units=vocabulary_size, name=f"{self.name}_logits") + else: + self.decoder = decoder self.time_reduction_factor = 1 - def _build(self, input_shape, batch_size=None): - features = tf.keras.Input(input_shape, batch_size=batch_size, dtype=tf.float32) - self(features, training=False) - @property def metrics(self): return [self.loss_metric] - def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, **kwargs): + def _build(self, input_shape, batch_size=None): + inputs = tf.keras.Input(input_shape, batch_size=batch_size, dtype=tf.float32) + inputs_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32) + self( + data_util.create_inputs( + inputs=inputs, + inputs_length=inputs_length + ), + training=False + ) + + def compile(self, + optimizer, + global_batch_size, + blank=0, + run_eagerly=None, + **kwargs): loss 
= CtcLoss(blank=blank, global_batch_size=global_batch_size) - self.use_loss_scale = use_loss_scale - if self.use_loss_scale: - optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic") - self.loss_metric = tf.keras.metrics.Mean(name="ctc_loss", dtype=tf.float32) - super(CtcModel, self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs) - - def train_step(self, batch): - x, y_true = batch - with tf.GradientTape() as tape: - logit = self(x["input"], training=True) - y_pred = { - "logit": logit, - "logit_length": math_util.get_reduced_length(x["input_length"], self.time_reduction_factor) - } - loss = self.loss(y_true, y_pred) - if self.use_loss_scale: - scaled_loss = self.optimizer.get_scaled_loss(loss) - if self.use_loss_scale: - scaled_gradients = tape.gradient(scaled_loss, self.trainable_weights) - gradients = self.optimizer.get_unscaled_gradients(scaled_gradients) - else: - gradients = tape.gradient(loss, self.trainable_weights) - self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) - self.loss_metric.update_state(loss) - return {m.name: m.result() for m in self.metrics} - - def test_step(self, batch): - x, y_true = batch - logit = self(x["input"], training=False) - y_pred = { - "logit": logit, - "logit_length": math_util.get_reduced_length(x["input_length"], self.time_reduction_factor) - } - loss = self.loss(y_true, y_pred) - self.loss_metric.update_state(loss) - return {m.name: m.result() for m in self.metrics} + super().compile(loss=loss, optimizer=optimizer, run_eagerly=run_eagerly, **kwargs) def add_featurizers(self, speech_featurizer: TFSpeechFeaturizer, @@ -83,7 +69,13 @@ def add_featurizers(self, self.text_featurizer = text_featurizer def call(self, inputs, training=False, **kwargs): - raise NotImplementedError() + inputs, inputs_length, _, _ = inputs.values() + logits = self.encoder(inputs, training=training, **kwargs) + logits = self.decoder(logits, training=training, 
**kwargs) + return data_util.create_logits( + logits=logits, + logits_length=math_util.get_reduced_length(inputs_length, self.time_reduction_factor) + ) # -------------------------------- GREEDY ------------------------------------- diff --git a/tensorflow_asr/models/deepspeech2.py b/tensorflow_asr/models/ctc/deepspeech2.py similarity index 84% rename from tensorflow_asr/models/deepspeech2.py rename to tensorflow_asr/models/ctc/deepspeech2.py index 1e855c5ef3..c8788cbf05 100644 --- a/tensorflow_asr/models/deepspeech2.py +++ b/tensorflow_asr/models/ctc/deepspeech2.py @@ -14,9 +14,9 @@ import tensorflow as tf -from ..utils import layer_util, math_util -from .layers.row_conv_1d import RowConv1D -from .layers.sequence_wise_bn import SequenceBatchNorm +from ...utils import layer_util, math_util +from ..layers.row_conv_1d import RowConv1D +from ..layers.sequence_wise_bn import SequenceBatchNorm from .ctc import CtcModel @@ -210,7 +210,6 @@ def get_config(self): class FcModule(tf.keras.Model): def __init__(self, - vocabulary_size: int, nlayers: int = 0, units: int = 1024, dropout: float = 0.1, @@ -225,28 +224,21 @@ def __init__(self, ) for i in range(nlayers) ] - # Fully connected layer - self.fc = tf.keras.layers.Dense(units=vocabulary_size, - use_bias=True, name=f"{self.name}_fc") - def call(self, inputs, training=False, **kwargs): outputs = inputs for block in self.blocks: outputs = block(outputs, training=training, **kwargs) - outputs = self.fc(outputs, training=training) return outputs def get_config(self): conf = {} for block in self.blocks: conf.update(block.get_config()) - conf.update(self.fc.get_config()) return conf -class DeepSpeech2(CtcModel): +class DeepSpeech2Encoder(tf.keras.Model): def __init__(self, - vocabulary_size: int, conv_type: str = "conv2d", conv_kernels: list = [[11, 41], [11, 21], [11, 21]], conv_strides: list = [[2, 2], [1, 2], [1, 2]], @@ -261,9 +253,9 @@ def __init__(self, fc_nlayers: int = 0, fc_units: int = 1024, fc_dropout: float = 0.1, - 
name: str = "deepspeech2",
+                 name="deepspeech2_encoder",
                  **kwargs):
-        super(DeepSpeech2, self).__init__(name=name, **kwargs)
+        super().__init__(name=name, **kwargs)  # forward `name`; sub-module names below derive from self.name
 
         self.conv_module = ConvModule(
             conv_type=conv_type,
@@ -288,27 +280,68 @@ def __init__(self,
             nlayers=fc_nlayers,
             units=fc_units,
             dropout=fc_dropout,
-            vocabulary_size=vocabulary_size,
             name=f"{self.name}_fc_module"
         )
 
-        self.time_reduction_factor = self.conv_module.reduction_factor
+    def summary(self, line_length=100, **kwargs):
+        self.conv_module.summary(line_length=line_length, **kwargs)
+        self.rnn_module.summary(line_length=line_length, **kwargs)
+        self.fc_module.summary(line_length=line_length, **kwargs)
+        super().summary(line_length=line_length, **kwargs)
 
-    def call(self, inputs, training=False, **kwargs):
+    def call(self, inputs, training=False, **kwargs):
         outputs = self.conv_module(inputs, training=training, **kwargs)
         outputs = self.rnn_module(outputs, training=training, **kwargs)
         outputs = self.fc_module(outputs, training=training, **kwargs)
         return outputs
 
-    def summary(self, line_length=100, **kwargs):
-        self.conv_module.summary(line_length=line_length, **kwargs)
-        self.rnn_module.summary(line_length=line_length, **kwargs)
-        self.fc_module.summary(line_length=line_length, **kwargs)
-        super(DeepSpeech2, self).summary(line_length=line_length, **kwargs)
-
     def get_config(self):
-        conf = super(DeepSpeech2, self).get_config()
+        conf = super().get_config()
         conf.update(self.conv_module.get_config())
         conf.update(self.rnn_module.get_config())
         conf.update(self.fc_module.get_config())
         return conf
+
+
+class DeepSpeech2(CtcModel):
+    def __init__(self,
+                 vocabulary_size: int,
+                 conv_type: str = "conv2d",
+                 conv_kernels: list = [[11, 41], [11, 21], [11, 21]],
+                 conv_strides: list = [[2, 2], [1, 2], [1, 2]],
+                 conv_filters: list = [32, 32, 96],
+                 conv_dropout: float = 0.1,
+                 rnn_nlayers: int = 5,
+                 rnn_type: str = "lstm",
+                 rnn_units: int = 1024,
+                 rnn_bidirectional: bool = True,
+                 rnn_rowconv: int = 0,
+                 rnn_dropout: float =
0.1, + fc_nlayers: int = 0, + fc_units: int = 1024, + fc_dropout: float = 0.1, + name: str = "deepspeech2", + **kwargs): + super().__init__( + encoder=DeepSpeech2Encoder( + conv_type=conv_type, + conv_kernels=conv_kernels, + conv_strides=conv_strides, + conv_filters=conv_filters, + conv_dropout=conv_dropout, + rnn_nlayers=rnn_nlayers, + rnn_type=rnn_type, + rnn_units=rnn_units, + rnn_bidirectional=rnn_bidirectional, + rnn_rowconv=rnn_rowconv, + rnn_dropout=rnn_dropout, + fc_nlayers=fc_nlayers, + fc_units=fc_units, + fc_dropout=fc_dropout, + name=f"{name}_encoder" + ), + vocabulary_size=vocabulary_size, + name=name, + **kwargs + ) + self.time_reduction_factor = self.encoder.conv_module.reduction_factor diff --git a/tensorflow_asr/models/jasper.py b/tensorflow_asr/models/ctc/jasper.py similarity index 74% rename from tensorflow_asr/models/jasper.py rename to tensorflow_asr/models/ctc/jasper.py index a8b0780403..963391a7bb 100644 --- a/tensorflow_asr/models/jasper.py +++ b/tensorflow_asr/models/ctc/jasper.py @@ -14,7 +14,7 @@ import tensorflow as tf -from ..utils import math_util +from ...utils import math_util from .ctc import CtcModel @@ -195,9 +195,8 @@ def get_config(self): return conf -class Jasper(CtcModel): +class JasperEncoder(tf.keras.Model): def __init__(self, - vocabulary_size: int, dense: bool = False, first_additional_block_channels: int = 256, first_additional_block_kernels: int = 11, @@ -220,9 +219,9 @@ def __init__(self, third_additional_block_dropout: int = 0.4, kernel_regularizer=None, bias_regularizer=None, - name: str = "jasper", + name: str = "jasper_encoder", **kwargs): - super(Jasper, self).__init__(name=name, **kwargs) + super().__init__(name=name, **kwargs) assert len(block_channels) == len(block_kernels) == len(block_dropout) @@ -275,18 +274,6 @@ def __init__(self, name=f"{self.name}_third_block" ) - self.last_block = tf.keras.layers.Conv1D( - filters=vocabulary_size, kernel_size=1, - strides=1, padding="same", - 
kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - name=f"{self.name}_last_block" - ) - - self.time_reduction_factor = self.first_additional_block.reduction_factor - self.time_reduction_factor *= self.second_additional_block.reduction_factor - self.time_reduction_factor *= self.third_additional_block.reduction_factor - def call(self, inputs, training=False, **kwargs): outputs = self.reshape(inputs) outputs = self.first_additional_block(outputs, training=training, **kwargs) @@ -297,18 +284,85 @@ def call(self, inputs, training=False, **kwargs): outputs = self.second_additional_block(outputs, training=training, **kwargs) outputs = self.third_additional_block(outputs, training=training, **kwargs) - outputs = self.last_block(outputs, training=training, **kwargs) return outputs def summary(self, line_length=100, **kwargs): - super(Jasper, self).summary(line_length=line_length, **kwargs) + super().summary(line_length=line_length, **kwargs) def get_config(self): - conf = self.reshape.get_config() + conf = super().get_config() + conf.update(self.reshape.get_config()) conf.update(self.first_additional_block.get_config()) for block in self.blocks: conf.update(block.get_config()) conf.update(self.second_additional_block.get_config()) conf.update(self.third_additional_block.get_config()) - conf.update(self.last_block.get_config()) return conf + + +class Jasper(CtcModel): + def __init__(self, + vocabulary_size: int, + dense: bool = False, + first_additional_block_channels: int = 256, + first_additional_block_kernels: int = 11, + first_additional_block_strides: int = 2, + first_additional_block_dilation: int = 1, + first_additional_block_dropout: int = 0.2, + nsubblocks: int = 5, + block_channels: list = [256, 384, 512, 640, 768], + block_kernels: list = [11, 13, 17, 21, 25], + block_dropout: list = [0.2, 0.2, 0.2, 0.3, 0.3], + second_additional_block_channels: int = 896, + second_additional_block_kernels: int = 1, + second_additional_block_strides: 
int = 1,
+                 second_additional_block_dilation: int = 2,
+                 second_additional_block_dropout: int = 0.4,
+                 third_additional_block_channels: int = 1024,
+                 third_additional_block_kernels: int = 1,
+                 third_additional_block_strides: int = 1,
+                 third_additional_block_dilation: int = 1,
+                 third_additional_block_dropout: int = 0.4,
+                 kernel_regularizer=None,
+                 bias_regularizer=None,
+                 name="jasper",
+                 **kwargs):
+        super().__init__(
+            encoder=JasperEncoder(
+                dense=dense,
+                first_additional_block_channels=first_additional_block_channels,
+                first_additional_block_kernels=first_additional_block_kernels,
+                first_additional_block_strides=first_additional_block_strides,
+                first_additional_block_dilation=first_additional_block_dilation,
+                first_additional_block_dropout=first_additional_block_dropout,
+                nsubblocks=nsubblocks,
+                block_channels=block_channels,
+                block_kernels=block_kernels,
+                block_dropout=block_dropout,
+                second_additional_block_channels=second_additional_block_channels,
+                second_additional_block_kernels=second_additional_block_kernels,
+                second_additional_block_strides=second_additional_block_strides,
+                second_additional_block_dilation=second_additional_block_dilation,
+                second_additional_block_dropout=second_additional_block_dropout,
+                third_additional_block_channels=third_additional_block_channels,
+                third_additional_block_kernels=third_additional_block_kernels,
+                third_additional_block_strides=third_additional_block_strides,
+                third_additional_block_dilation=third_additional_block_dilation,
+                third_additional_block_dropout=third_additional_block_dropout,
+                kernel_regularizer=kernel_regularizer,
+                bias_regularizer=bias_regularizer,
+            ),
+            decoder=tf.keras.layers.Conv1D(
+                filters=vocabulary_size, kernel_size=1,
+                strides=1, padding="same",
+                kernel_regularizer=kernel_regularizer,
+                bias_regularizer=bias_regularizer,
+                name=f"{name}_logits"
+            ),
+            vocabulary_size=vocabulary_size,
+            name=name,
+            **kwargs
+        )
+        self.time_reduction_factor =
self.encoder.first_additional_block.reduction_factor + self.time_reduction_factor *= self.encoder.second_additional_block.reduction_factor + self.time_reduction_factor *= self.encoder.third_additional_block.reduction_factor diff --git a/tensorflow_asr/models/transducer/__init__.py b/tensorflow_asr/models/transducer/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorflow_asr/models/conformer.py b/tensorflow_asr/models/transducer/conformer.py old mode 100755 new mode 100644 similarity index 98% rename from tensorflow_asr/models/conformer.py rename to tensorflow_asr/models/transducer/conformer.py index a13dfa1d19..f66197d972 --- a/tensorflow_asr/models/conformer.py +++ b/tensorflow_asr/models/transducer/conformer.py @@ -14,12 +14,12 @@ import tensorflow as tf -from .activations.glu import GLU +from ..activations.glu import GLU from .transducer import Transducer -from .layers.subsampling import VggSubsampling, Conv2dSubsampling -from .layers.positional_encoding import PositionalEncoding, PositionalEncodingConcat -from .layers.multihead_attention import MultiHeadAttention, RelPositionMultiHeadAttention -from ..utils import shape_util +from ..layers.subsampling import VggSubsampling, Conv2dSubsampling +from ..layers.positional_encoding import PositionalEncoding, PositionalEncodingConcat +from ..layers.multihead_attention import MultiHeadAttention, RelPositionMultiHeadAttention +from ...utils import shape_util L2 = tf.keras.regularizers.l2(1e-6) diff --git a/tensorflow_asr/models/contextnet.py b/tensorflow_asr/models/transducer/contextnet.py similarity index 99% rename from tensorflow_asr/models/contextnet.py rename to tensorflow_asr/models/transducer/contextnet.py index 636560101d..dac9e9050d 100644 --- a/tensorflow_asr/models/contextnet.py +++ b/tensorflow_asr/models/transducer/contextnet.py @@ -16,7 +16,7 @@ from typing import List import tensorflow as tf from .transducer import Transducer -from ..utils import math_util +from ...utils 
import math_util L2 = tf.keras.regularizers.l2(1e-6) diff --git a/tensorflow_asr/models/streaming_transducer.py b/tensorflow_asr/models/transducer/rnn_transducer.py similarity index 96% rename from tensorflow_asr/models/streaming_transducer.py rename to tensorflow_asr/models/transducer/rnn_transducer.py index ba793126e2..88ef18d80c 100644 --- a/tensorflow_asr/models/streaming_transducer.py +++ b/tensorflow_asr/models/transducer/rnn_transducer.py @@ -15,16 +15,16 @@ import tensorflow as tf -from .layers.subsampling import TimeReduction +from ..layers.subsampling import TimeReduction from .transducer import Transducer -from ..utils import layer_util, math_util, shape_util +from ...utils import layer_util, math_util, shape_util class Reshape(tf.keras.layers.Layer): def call(self, inputs): return math_util.merge_two_last_dims(inputs) -class StreamingTransducerBlock(tf.keras.Model): +class RnnTransducerBlock(tf.keras.Model): def __init__(self, reduction_factor: int = 0, dmodel: int = 640, @@ -34,7 +34,7 @@ def __init__(self, kernel_regularizer=None, bias_regularizer=None, **kwargs): - super(StreamingTransducerBlock, self).__init__(**kwargs) + super().__init__(**kwargs) if reduction_factor > 0: self.reduction = TimeReduction(reduction_factor, name=f"{self.name}_reduction") @@ -94,7 +94,7 @@ def get_config(self): return conf -class StreamingTransducerEncoder(tf.keras.Model): +class RnnTransducerEncoder(tf.keras.Model): def __init__(self, reductions: dict = {0: 3, 1: 2}, dmodel: int = 640, @@ -105,12 +105,12 @@ def __init__(self, kernel_regularizer=None, bias_regularizer=None, **kwargs): - super(StreamingTransducerEncoder, self).__init__(**kwargs) + super().__init__(**kwargs) self.reshape = Reshape(name=f"{self.name}_reshape") self.blocks = [ - StreamingTransducerBlock( + RnnTransducerBlock( reduction_factor=reductions.get(i, 0), # key is index, value is the factor dmodel=dmodel, rnn_type=rnn_type, @@ -174,7 +174,7 @@ def get_config(self): return conf -class 
StreamingTransducer(Transducer): +class RnnTransducer(Transducer): def __init__(self, vocabulary_size: int, encoder_reductions: dict = {0: 3, 1: 2}, @@ -200,10 +200,10 @@ def __init__(self, joint_trainable: bool = True, kernel_regularizer = None, bias_regularizer = None, - name = "StreamingTransducer", + name = "RnnTransducer", **kwargs): - super(StreamingTransducer, self).__init__( - encoder=StreamingTransducerEncoder( + super().__init__( + encoder=RnnTransducerEncoder( reductions=encoder_reductions, dmodel=encoder_dmodel, nlayers=encoder_nlayers, diff --git a/tensorflow_asr/models/transducer.py b/tensorflow_asr/models/transducer/transducer.py old mode 100755 new mode 100644 similarity index 94% rename from tensorflow_asr/models/transducer.py rename to tensorflow_asr/models/transducer/transducer.py index efd3c4d55e..8917bf5a3b --- a/tensorflow_asr/models/transducer.py +++ b/tensorflow_asr/models/transducer/transducer.py @@ -15,14 +15,13 @@ import collections import tensorflow as tf -from tensorflow.keras import mixed_precision as mxp -from . 
import Model -from ..utils import math_util, layer_util, shape_util -from ..featurizers.speech_featurizers import SpeechFeaturizer -from ..featurizers.text_featurizers import TextFeaturizer -from .layers.embedding import Embedding -from ..losses.keras.rnnt_losses import RnntLoss +from ..base_model import BaseModel +from ...utils import math_util, layer_util, shape_util, data_util +from ...featurizers.speech_featurizers import SpeechFeaturizer +from ...featurizers.text_featurizers import TextFeaturizer +from ..layers.embedding import Embedding +from ...losses.rnnt_loss import RnntLoss Hypothesis = collections.namedtuple("Hypothesis", ("index", "prediction", "states")) @@ -44,7 +43,7 @@ def __init__(self, bias_regularizer=None, name="transducer_prediction", **kwargs): - super(TransducerPrediction, self).__init__(name=name, **kwargs) + super().__init__(name=name, **kwargs) self.embed = Embedding(vocabulary_size, embed_dim, regularizer=kernel_regularizer, name=f"{name}_embedding") self.do = tf.keras.layers.Dropout(embed_dropout, name=f"{name}_dropout") @@ -148,7 +147,7 @@ def __init__(self, axis: int = 1, name="transducer_joint_reshape", **kwargs): - super(TransducerJointReshape, self).__init__(name=name, trainable=False, **kwargs) + super().__init__(name=name, trainable=False, **kwargs) self.axis = axis def call(self, inputs, repeats=None, **kwargs): @@ -173,7 +172,7 @@ def __init__(self, bias_regularizer=None, name="tranducer_joint", **kwargs): - super(TransducerJoint, self).__init__(name=name, **kwargs) + super().__init__(name=name, **kwargs) activation = activation.lower() if activation == "linear": @@ -248,7 +247,7 @@ def get_config(self): return conf -class Transducer(Model): +class Transducer(BaseModel): """ Transducer Model Warper """ def __init__(self, @@ -273,7 +272,7 @@ def __init__(self, bias_regularizer=None, name="transducer", **kwargs): - super(Transducer, self).__init__(name=name, **kwargs) + super().__init__(name=name, **kwargs) self.encoder = encoder 
self.predict_net = TransducerPrediction( vocabulary_size=vocabulary_size, @@ -304,21 +303,20 @@ def __init__(self, ) self.time_reduction_factor = 1 - @property - def metrics(self): - return [self.loss_metric] - def _build(self, input_shape, prediction_shape=[None], batch_size=None): inputs = tf.keras.Input(shape=input_shape, batch_size=batch_size, dtype=tf.float32) - input_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32) - pred = tf.keras.Input(shape=prediction_shape, batch_size=batch_size, dtype=tf.int32) - pred_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32) - self({ - "input": inputs, - "input_length": input_length, - "prediction": pred, - "prediction_length": pred_length - }, training=False) + inputs_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32) + predictions = tf.keras.Input(shape=prediction_shape, batch_size=batch_size, dtype=tf.int32) + predictions_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32) + self( + data_util.create_inputs( + inputs=inputs, + inputs_length=inputs_length, + predictions=predictions, + predictions_length=predictions_length + ), + training=False + ) def summary(self, line_length=None, **kwargs): if self.encoder is not None: self.encoder.summary(line_length=line_length, **kwargs) @@ -339,27 +337,26 @@ def add_featurizers(self, self.speech_featurizer = speech_featurizer self.text_featurizer = text_featurizer - def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, **kwargs): + def compile(self, + optimizer, + global_batch_size, + blank=0, + run_eagerly=None, + **kwargs): loss = RnntLoss(blank=blank, global_batch_size=global_batch_size) - self.use_loss_scale = use_loss_scale - if self.use_loss_scale: - optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic") - self.loss_metric = tf.keras.metrics.Mean(name="rnnt_loss", dtype=tf.float32) - super(Transducer, 
self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs) + super().compile(loss=loss, optimizer=optimizer, run_eagerly=run_eagerly, **kwargs) def call(self, inputs, training=False, **kwargs): - features = inputs["input"] - prediction = inputs["prediction"] - prediction_length = inputs["prediction_length"] - enc = self.encoder(features, training=training, **kwargs) - pred = self.predict_net([prediction, prediction_length], training=training, **kwargs) - outputs = self.joint_net([enc, pred], training=training, **kwargs) - return { - "logit": outputs, - "logit_length": math_util.get_reduced_length(inputs["input_length"], self.time_reduction_factor) - } - - # -------------------------------- INFERENCES------------------------------------- + inputs, inputs_length, predictions, predictions_length = inputs.values() + enc = self.encoder(inputs, training=training, **kwargs) + pred = self.predict_net([predictions, predictions_length], training=training, **kwargs) + logits = self.joint_net([enc, pred], training=training, **kwargs) + return data_util.create_logits( + logits=logits, + logits_length=math_util.get_reduced_length(inputs_length, self.time_reduction_factor) + ) + + # -------------------------------- INFERENCES ------------------------------------- def encoder_inference(self, features: tf.Tensor): """Infer function for encoder (or encoders) diff --git a/tensorflow_asr/optimizers/schedules.py b/tensorflow_asr/optimizers/schedules.py index 1edd8003e9..ec8d151774 100755 --- a/tensorflow_asr/optimizers/schedules.py +++ b/tensorflow_asr/optimizers/schedules.py @@ -103,13 +103,14 @@ class CyclicTransformerSchedule(tf.keras.optimizers.schedules.LearningRateSchedu step_size: number of training iterations per half cycle. Authors suggest setting step_size 2-8 x training iterations in epoch. 
- + It is inspired from the paper: # References - [Cyclical Learning Rates for Training Neural Networks]( https://arxiv.org/abs/1506.01186) """ - def __init__(self, d_model, warmup_steps=4000, max_lr=None, + + def __init__(self, d_model, warmup_steps=4000, max_lr=None, step_size=None): """Applies triangular cyclic to the square root decay learning rate. Args: @@ -134,7 +135,7 @@ def __call__(self, step): cycle = tf.math.floor(1 + step / (2 * self.step_size)) x = tf.math.abs(step / self.step_size - 2 * cycle + 1) lr = lr * (0.5 + tf.math.maximum(0., x)) - lr = tf.math.minimum(self.max_lr, + lr = tf.math.minimum(self.max_lr, tf.math.minimum(lr, warmup)) return lr @@ -145,4 +146,3 @@ def get_config(self): "max_lr": self.max_lr, "step_size": self.step_size } - \ No newline at end of file diff --git a/tensorflow_asr/utils/data_util.py b/tensorflow_asr/utils/data_util.py new file mode 100644 index 0000000000..324c720d49 --- /dev/null +++ b/tensorflow_asr/utils/data_util.py @@ -0,0 +1,43 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# tf.data.Dataset does not work well for namedtuple so we are using dict
+
+import tensorflow as tf
+
+
+def create_inputs(inputs: tf.Tensor,
+                  inputs_length: tf.Tensor,
+                  predictions: tf.Tensor = None,
+                  predictions_length: tf.Tensor = None) -> dict:
+    return {
+        "inputs": inputs,
+        "inputs_length": inputs_length,
+        "predictions": predictions,
+        "predictions_length": predictions_length
+    }
+
+
+def create_logits(logits: tf.Tensor, logits_length: tf.Tensor) -> dict:
+    return {
+        "logits": logits,
+        "logits_length": logits_length
+    }
+
+
+def create_labels(labels: tf.Tensor, labels_length: tf.Tensor) -> dict:
+    return {
+        "labels": labels,
+        "labels_length": labels_length,
+    }
diff --git a/tensorflow_asr/utils/file_util.py b/tensorflow_asr/utils/file_util.py
index c9d1c867d0..0d69315c87 100644
--- a/tensorflow_asr/utils/file_util.py
+++ b/tensorflow_asr/utils/file_util.py
@@ -14,6 +14,8 @@
 
+import contextlib
 import os
 import re
+import tempfile
 from typing import Union, List
 
 import tensorflow as tf
@@ -55,3 +57,27 @@ def read_bytes(path: str) -> tf.Tensor:
     with tf.io.gfile.GFile(path, "rb") as f:
         content = f.read()
     return tf.convert_to_tensor(content, dtype=tf.string)
+
+
+@contextlib.contextmanager
+def save_file(filepath):
+    """Yield a path to write to; an HDF5 file on a cloud path is written to a temp file, then copied to `filepath`."""
+    if is_cloud_path(filepath) and is_hdf5_filepath(filepath):
+        _, ext = os.path.splitext(filepath)
+        with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
+            yield tmp.name
+            tf.io.gfile.copy(tmp.name, filepath, overwrite=True)
+    else:
+        yield filepath
+
+
+@contextlib.contextmanager
+def read_file(filepath):
+    """Yield a path to read from; an HDF5 file on a cloud path is first copied into a temp file."""
+    if is_cloud_path(filepath) and is_hdf5_filepath(filepath):
+        _, ext = os.path.splitext(filepath)
+        with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
+            tf.io.gfile.copy(filepath, tmp.name, overwrite=True)
+            yield tmp.name
+    else:
+        yield filepath
From 65f222d31005fdc5a45e22e0ee847f419c92849c Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen
Date: Tue, 13 Apr 2021 01:07:40 +0700
Subject: [PATCH 03/13] :rocket: refactor featurizers, datasets, augmentations

---
 tensorflow_asr/augmentations/augmentation.py | 58 ++
tensorflow_asr/augmentations/augments.py | 86 --- .../augmentations/methods/__init__.py | 0 .../methods/base_method.py} | 10 +- .../augmentations/methods/specaugment.py | 75 +++ .../augmentations/signal_augment.py | 101 --- tensorflow_asr/augmentations/spec_augment.py | 203 ------ tensorflow_asr/configs/config.py | 16 +- tensorflow_asr/datasets/asr_dataset.py | 79 ++- tensorflow_asr/datasets/keras/__init__.py | 16 - tensorflow_asr/datasets/keras/asr_dataset.py | 176 ------ .../featurizers/methods/__init__.py | 0 .../featurizers/{ => methods}/gammatone.py | 0 .../featurizers/speech_featurizers.py | 36 +- .../featurizers/text_featurizers.py | 117 +--- tensorflow_asr/featurizers/wordpiece.py | 577 ------------------ tensorflow_asr/models/keras/conformer.py | 93 --- tensorflow_asr/models/keras/contextnet.py | 181 ------ tensorflow_asr/models/keras/ctc.py | 66 -- tensorflow_asr/models/keras/deepspeech2.py | 86 --- tensorflow_asr/models/keras/jasper.py | 137 ----- .../models/keras/streaming_transducer.py | 201 ------ tensorflow_asr/models/keras/transducer.py | 93 --- tensorflow_asr/runners/README.md | 24 - tensorflow_asr/runners/__init__.py | 42 -- tensorflow_asr/runners/base_runners.py | 498 --------------- tensorflow_asr/runners/ctc_runners.py | 139 ----- tensorflow_asr/runners/transducer_runners.py | 136 ----- 28 files changed, 224 insertions(+), 3022 deletions(-) create mode 100644 tensorflow_asr/augmentations/augmentation.py delete mode 100755 tensorflow_asr/augmentations/augments.py create mode 100644 tensorflow_asr/augmentations/methods/__init__.py rename tensorflow_asr/{models/keras/__init__.py => augmentations/methods/base_method.py} (80%) create mode 100644 tensorflow_asr/augmentations/methods/specaugment.py delete mode 100644 tensorflow_asr/augmentations/signal_augment.py delete mode 100755 tensorflow_asr/augmentations/spec_augment.py delete mode 100644 tensorflow_asr/datasets/keras/__init__.py delete mode 100644 tensorflow_asr/datasets/keras/asr_dataset.py 
create mode 100644 tensorflow_asr/featurizers/methods/__init__.py rename tensorflow_asr/featurizers/{ => methods}/gammatone.py (100%) delete mode 100644 tensorflow_asr/featurizers/wordpiece.py delete mode 100644 tensorflow_asr/models/keras/conformer.py delete mode 100644 tensorflow_asr/models/keras/contextnet.py delete mode 100644 tensorflow_asr/models/keras/ctc.py delete mode 100644 tensorflow_asr/models/keras/deepspeech2.py delete mode 100644 tensorflow_asr/models/keras/jasper.py delete mode 100644 tensorflow_asr/models/keras/streaming_transducer.py delete mode 100644 tensorflow_asr/models/keras/transducer.py delete mode 100644 tensorflow_asr/runners/README.md delete mode 100644 tensorflow_asr/runners/__init__.py delete mode 100644 tensorflow_asr/runners/base_runners.py delete mode 100644 tensorflow_asr/runners/ctc_runners.py delete mode 100644 tensorflow_asr/runners/transducer_runners.py diff --git a/tensorflow_asr/augmentations/augmentation.py b/tensorflow_asr/augmentations/augmentation.py new file mode 100644 index 0000000000..314a6488b6 --- /dev/null +++ b/tensorflow_asr/augmentations/augmentation.py @@ -0,0 +1,58 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import tensorflow as tf
+
+from .methods import specaugment
+
+
+AUGMENTATIONS = {  # config key -> augmentation class; parse() looks names up here
+    "freq_masking": specaugment.FreqMasking,
+    "time_masking": specaugment.TimeMasking,
+}
+
+
+class Augmentation:  # holds two augmentation lists ("before" / "after") built from a config dict
+    def __init__(self, config: dict = None):
+        if not config: config = {}  # None or empty config -> no augmentations, default prob
+        self.prob = float(config.pop("prob", 0.5))  # pop() removes the key from the caller's dict
+        self.before = self.parse(config.pop("before", {}))  # list used by signal_augment
+        self.after = self.parse(config.pop("after", {}))  # list used by feature_augment
+
+    def _augment(self, inputs, augmentations):  # apply each augmentation in order, each gated by its own draw
+        outputs = inputs
+        for au in augmentations:
+            p = tf.random.uniform([])  # fresh scalar sample per augmentation
+            outputs = tf.where(tf.less(p, self.prob), au.augment(outputs), outputs)  # au.augment is always called; where() picks the result
+        return outputs
+
+    @tf.function
+    def signal_augment(self, inputs):  # runs the "before" list
+        return self._augment(inputs, self.before)
+
+    @tf.function
+    def feature_augment(self, inputs):  # runs the "after" list
+        return self._augment(inputs, self.after)
+
+    @staticmethod
+    def parse(config: dict) -> list:  # {name: kwargs-or-None} -> list of augmentation instances
+        augmentations = []
+        for key, value in config.items():
+            au = AUGMENTATIONS.get(key, None)
+            if au is None:  # unknown name: fail with the list of known names
+                raise KeyError(f"No tf augmentation named: {key}\n"
+                               f"Available tf augmentations: {AUGMENTATIONS.keys()}")
+            aug = au(**value) if value is not None else au()  # a None value means "use the class defaults"
+            augmentations.append(aug)
+        return augmentations
diff --git a/tensorflow_asr/augmentations/augments.py b/tensorflow_asr/augmentations/augments.py
deleted file mode 100755
index 24a59841de..0000000000
--- a/tensorflow_asr/augmentations/augments.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -import tensorflow as tf -import nlpaug.flow as naf - -from .signal_augment import SignalCropping, SignalLoudness, SignalMask, SignalNoise, \ - SignalPitch, SignalShift, SignalSpeed, SignalVtlp -from .spec_augment import FreqMasking, TimeMasking, TFFreqMasking, TFTimeMasking - - -AUGMENTATIONS = { - "freq_masking": FreqMasking, - "time_masking": TimeMasking, - "noise": SignalNoise, - "masking": SignalMask, - "cropping": SignalCropping, - "loudness": SignalLoudness, - "pitch": SignalPitch, - "shift": SignalShift, - "speed": SignalSpeed, - "vtlp": SignalVtlp -} - -TFAUGMENTATIONS = { - "freq_masking": TFFreqMasking, - "time_masking": TFTimeMasking, -} - - -class TFAugmentationExecutor: - def __init__(self, augmentations: list, prob: float = 0.5): - self.augmentations = augmentations - self.prob = prob - - @tf.function - def augment(self, inputs): - outputs = inputs - for au in self.augmentations: - p = tf.random.uniform([]) - outputs = tf.where(tf.less(p, self.prob), au.augment(outputs), outputs) - return outputs - - -class Augmentation: - def __init__(self, config: dict = None, use_tf: bool = False): - if not config: config = {} - prob = float(config.pop("prob", 0.5)) - parser = self.tf_parse if use_tf else self.parse - self.before = parser(config.pop("before", {}), prob=prob) - self.after = parser(config.pop("after", {}), prob=prob) - - @staticmethod - def parse(config: dict, prob: float = 0.5) -> naf.Sometimes: - augmentations = [] - for key, value in config.items(): - au = AUGMENTATIONS.get(key, None) - if au is None: - raise KeyError(f"No augmentation named: {key}\n" - f"Available augmentations: {AUGMENTATIONS.keys()}") - aug = au(**value) if value is not None else au() - augmentations.append(aug) - return naf.Sometimes(augmentations, pipeline_p=prob) - - @staticmethod - def tf_parse(config: dict, prob: float = 0.5) -> TFAugmentationExecutor: - 
augmentations = []
-        for key, value in config.items():
-            au = TFAUGMENTATIONS.get(key, None)
-            if au is None:
-                raise KeyError(f"No tf augmentation named: {key}\n"
-                               f"Available tf augmentations: {TFAUGMENTATIONS.keys()}")
-            aug = au(**value) if value is not None else au()
-            augmentations.append(aug)
-        return TFAugmentationExecutor(augmentations, prob=prob)
diff --git a/tensorflow_asr/augmentations/methods/__init__.py b/tensorflow_asr/augmentations/methods/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow_asr/models/keras/__init__.py b/tensorflow_asr/augmentations/methods/base_method.py
similarity index 80%
rename from tensorflow_asr/models/keras/__init__.py
rename to tensorflow_asr/augmentations/methods/base_method.py
index c494840752..6cc9c0f759 100644
--- a/tensorflow_asr/models/keras/__init__.py
+++ b/tensorflow_asr/augmentations/methods/base_method.py
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .transducer import Transducer
-from .conformer import Conformer
-__all__ = ['Transducer', 'Conformer']
+import tensorflow as tf
+
+
+class AugmentationMethod:  # base class; FreqMasking and TimeMasking in specaugment.py derive from it
+    @tf.function
+    def augment(self, *args, **kwargs):  # placeholder signature; the base implementation only raises
+        raise NotImplementedError()  # subclasses provide the real augment()
diff --git a/tensorflow_asr/augmentations/methods/specaugment.py b/tensorflow_asr/augmentations/methods/specaugment.py
new file mode 100644
index 0000000000..b948b6644e
--- /dev/null
+++ b/tensorflow_asr/augmentations/methods/specaugment.py
@@ -0,0 +1,75 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+from ...utils import shape_util
+from .base_method import AugmentationMethod
+
+
+class FreqMasking(AugmentationMethod):  # zeroes randomly placed bands along the feature-bin axis
+    def __init__(self, num_masks: int = 1, mask_factor: float = 27):
+        self.num_masks = num_masks  # number of bands augment() applies in sequence
+        self.mask_factor = mask_factor  # exclusive upper bound on each band's width, in bins
+
+    @tf.function
+    def augment(self, spectrogram: tf.Tensor):
+        """
+        Zero out `num_masks` randomly placed bands of frequency channels (shape[1])
+        Args:
+            spectrogram: shape (T, num_feature_bins, V)
+        Returns:
+            frequency masked spectrogram, same shape and dtype as the input
+        """
+        T, F, V = shape_util.shape_list(spectrogram, out_type=tf.int32)
+        for _ in range(self.num_masks):
+            f = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32)
+            f = tf.minimum(f, F)  # a band cannot be wider than the axis itself
+            f0 = tf.random.uniform([], minval=0, maxval=tf.maximum(F - f, 1), dtype=tf.int32)  # int sampling needs minval < maxval, also when f == F
+            mask = tf.concat([
+                tf.ones([T, f0, V], dtype=spectrogram.dtype),
+                tf.zeros([T, f, V], dtype=spectrogram.dtype),
+                tf.ones([T, F - f0 - f, V], dtype=spectrogram.dtype)
+            ], axis=1)
+            spectrogram = spectrogram * mask  # multiplying by the 0/1 mask zeroes bins [f0, f0 + f)
+        return spectrogram
+
+
+class TimeMasking(AugmentationMethod):  # zeroes randomly placed spans along the time axis
+    def __init__(self, num_masks: int = 1, mask_factor: float = 100, p_upperbound: float = 1.0):
+        self.num_masks = num_masks  # number of spans augment() applies in sequence
+        self.mask_factor = mask_factor  # exclusive upper bound on each span's length, in frames
+        self.p_upperbound = p_upperbound  # a span is additionally capped at p_upperbound * T frames
+
+    @tf.function
+    def augment(self, spectrogram: tf.Tensor):
+        """
+        Zero out `num_masks` randomly placed spans of time steps (shape[0])
+        Args:
+            spectrogram: shape (T, num_feature_bins, V)
+        Returns:
+            time masked spectrogram, same shape and dtype as the input
+        """
+        T, F, V = shape_util.shape_list(spectrogram, out_type=tf.int32)
+        for _ in range(self.num_masks):
+            t = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32)
+            t = tf.minimum(t, tf.cast(tf.cast(T, dtype=tf.float32) * self.p_upperbound, dtype=tf.int32))
+            t0 = tf.random.uniform([], minval=0, maxval=tf.maximum(T - t, 1), dtype=tf.int32)  # int sampling needs minval < maxval, also when t == T
+            mask = tf.concat([
+                tf.ones([t0, F, V], dtype=spectrogram.dtype),
+                tf.zeros([t, F, V], dtype=spectrogram.dtype),
+                tf.ones([T - t0 - t, F, V], dtype=spectrogram.dtype)
+            ], axis=0)
+            spectrogram = spectrogram * mask  # multiplying by the 0/1 mask zeroes frames [t0, t0 + t)
+        return spectrogram
diff --git a/tensorflow_asr/augmentations/signal_augment.py b/tensorflow_asr/augmentations/signal_augment.py
deleted file mode 100644
index c0b2444b2e..0000000000
--- a/tensorflow_asr/augmentations/signal_augment.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -import os -import glob -import librosa -import nlpaug.augmenter.audio as naa - - -class SignalCropping(naa.CropAug): - def __init__(self, - zone=(0.2, 0.8), - coverage=0.1, - crop_range=(0.2, 0.8), - crop_factor=2): - super(SignalCropping, self).__init__(sampling_rate=None, zone=zone, coverage=coverage, - crop_range=crop_range, crop_factor=crop_factor, - duration=None) - - -class SignalLoudness(naa.LoudnessAug): - def __init__(self, - zone=(0.2, 0.8), - coverage=1., - factor=(0.5, 2)): - super(SignalLoudness, self).__init__(zone=zone, coverage=coverage, factor=factor) - - -class SignalMask(naa.MaskAug): - def __init__(self, - zone=(0.2, 0.8), - coverage=1., - mask_range=(0.2, 0.8), - mask_factor=2, - mask_with_noise=True): - super(SignalMask, self).__init__(sampling_rate=None, zone=zone, coverage=coverage, - duration=None, mask_range=mask_range, - mask_factor=mask_factor, - mask_with_noise=mask_with_noise) - - -class SignalNoise(naa.NoiseAug): - def __init__(self, - sample_rate=16000, - zone=(0.2, 0.8), - coverage=1., - color="random", - noises: str = None): - if noises is not None: - noises = glob.glob(os.path.join(noises, "**", "*.wav"), recursive=True) - noises = [librosa.load(n, sr=sample_rate)[0] for n in noises] - super(SignalNoise, self).__init__(zone=zone, coverage=coverage, - color=color, noises=noises) - - -class SignalPitch(naa.PitchAug): - def __init__(self, - zone=(0.2, 0.8), - coverage=1., - factor=(-10, 10)): - super(SignalPitch, self).__init__(None, zone=zone, coverage=coverage, - duration=None, factor=factor) - - -class SignalShift(naa.ShiftAug): - def __init__(self, - sample_rate=16000, - duration=3, - direction="random"): - super(SignalShift, self).__init__(sample_rate, duration=duration, direction=direction) - - -class SignalSpeed(naa.SpeedAug): - def __init__(self, - zone=(0.2, 0.8), - coverage=1., - factor=(0.5, 2)): - super(SignalSpeed, self).__init__(zone=zone, coverage=coverage, - duration=None, factor=factor) - - -class 
SignalVtlp(naa.VtlpAug): - def __init__(self, - sample_rate=16000, - zone=(0.2, 0.8), - coverage=0.1, - fhi=4800, - factor=(0.9, 1.1)): - super(SignalVtlp, self).__init__(sample_rate, zone=zone, coverage=coverage, - duration=None, fhi=fhi, factor=factor) diff --git a/tensorflow_asr/augmentations/spec_augment.py b/tensorflow_asr/augmentations/spec_augment.py deleted file mode 100755 index 9e1f68726d..0000000000 --- a/tensorflow_asr/augmentations/spec_augment.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" Augmentation on spectrogram: http://arxiv.org/abs/1904.08779 """ - -import numpy as np -import tensorflow as tf - -from nlpaug.flow import Sequential -from nlpaug.util import Action -from nlpaug.model.spectrogram import Spectrogram -from nlpaug.augmenter.spectrogram import SpectrogramAugmenter - -from ..utils.utils import shape_list - -# ---------------------------- FREQ MASKING ---------------------------- - - -class FreqMaskingModel(Spectrogram): - def __init__(self, mask_factor: int = 27): - """ - Args: - freq_mask_param: parameter F of frequency masking - """ - super(FreqMaskingModel, self).__init__() - self.mask_factor = mask_factor - - def mask(self, data: np.ndarray) -> np.ndarray: - """ - Masking the frequency channels (make features on some channel 0) - Args: - spectrogram: shape (T, num_feature_bins, V) - Returns: - frequency masked spectrogram - """ - spectrogram = data.copy() - freq = np.random.randint(0, self.mask_factor + 1) - freq = min(freq, spectrogram.shape[1]) - freq0 = np.random.randint(0, spectrogram.shape[1] - freq + 1) - spectrogram[:, freq0:freq0 + freq, :] = 0 # masking - return spectrogram - - -class FreqMaskingAugmenter(SpectrogramAugmenter): - def __init__(self, - mask_factor: float = 27, - name: str = "FreqMaskingAugmenter", - verbose=0): - super(FreqMaskingAugmenter, self).__init__( - action=Action.SUBSTITUTE, zone=(0.2, 0.8), name=name, device="cpu", verbose=verbose, - coverage=1., factor=(40, 80), silence=False, stateless=True) - self.model = FreqMaskingModel(mask_factor) - - def substitute(self, data): - return self.model.mask(data) - - -class FreqMasking(SpectrogramAugmenter): - def __init__(self, - num_masks: int = 1, - mask_factor: float = 27, - name: str = "FreqMasking", - verbose=0): - super(FreqMasking, self).__init__( - action=Action.SUBSTITUTE, zone=(0.2, 0.8), name=name, device="cpu", verbose=verbose, - coverage=1., factor=(40, 80), silence=False, stateless=True) - self.flow = 
Sequential([FreqMaskingAugmenter(mask_factor) for _ in range(num_masks)]) - - def substitute(self, data): - return self.flow.augment(data) - - -class TFFreqMasking: - def __init__(self, num_masks: int = 1, mask_factor: float = 27): - self.num_masks = num_masks - self.mask_factor = mask_factor - - @tf.function - def augment(self, spectrogram: tf.Tensor): - """ - Masking the frequency channels (shape[1]) - Args: - spectrogram: shape (T, num_feature_bins, V) - Returns: - frequency masked spectrogram - """ - T, F, V = shape_list(spectrogram, out_type=tf.int32) - for _ in range(self.num_masks): - f = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32) - f = tf.minimum(f, F) - f0 = tf.random.uniform([], minval=0, maxval=(F - f), dtype=tf.int32) - mask = tf.concat([ - tf.ones([T, f0, V], dtype=spectrogram.dtype), - tf.zeros([T, f, V], dtype=spectrogram.dtype), - tf.ones([T, F - f0 - f, V], dtype=spectrogram.dtype) - ], axis=1) - spectrogram = spectrogram * mask - return spectrogram - - -# ---------------------------- TIME MASKING ---------------------------- - - -class TimeMaskingModel(Spectrogram): - def __init__(self, mask_factor: float = 100, p_upperbound: float = 1.0): - """ - Args: - time_mask_param: parameter W of time masking - p_upperbound: an upperbound so that the number of masked time - steps must not exceed p_upperbound * total_time_steps - """ - super(TimeMaskingModel, self).__init__() - self.mask_factor = mask_factor - self.p_upperbound = p_upperbound - assert 0.0 <= self.p_upperbound <= 1.0, "0.0 <= p_upperbound <= 1.0" - - def mask(self, data: np.ndarray) -> np.ndarray: - """ - Masking the time steps (make features on some time steps 0) - Args: - spectrogram: shape (T, num_feature_bins, V) - Returns: - a tensor that's applied time masking - """ - spectrogram = data.copy() - time = np.random.randint(0, self.mask_factor + 1) - time = min(time, int(self.p_upperbound * spectrogram.shape[0])) - time0 = np.random.randint(0, 
spectrogram.shape[0] - time + 1) - spectrogram[time0:time0 + time, :, :] = 0 - return spectrogram - - -class TimeMaskingAugmenter(SpectrogramAugmenter): - def __init__(self, - mask_factor: float = 100, - p_upperbound: float = 1, - name: str = "TimeMaskingAugmenter", - verbose=0): - super(TimeMaskingAugmenter, self).__init__( - action=Action.SUBSTITUTE, zone=(0.2, 0.8), name=name, device="cpu", verbose=verbose, - coverage=1., silence=False, stateless=True) - self.model = TimeMaskingModel(mask_factor, p_upperbound) - - def substitute(self, data): - return self.model.mask(data) - - -class TimeMasking(SpectrogramAugmenter): - def __init__(self, - num_masks: int = 1, - mask_factor: float = 100, - p_upperbound: float = 1, - name: str = "TimeMasking", - verbose=0): - super(TimeMasking, self).__init__( - action=Action.SUBSTITUTE, zone=(0.2, 0.8), name=name, device="cpu", verbose=verbose, - coverage=1., silence=False, stateless=True) - self.flow = Sequential([ - TimeMaskingAugmenter(mask_factor, p_upperbound) for _ in range(num_masks) - ]) - - def substitute(self, data): - return self.flow.augment(data) - - -class TFTimeMasking: - def __init__(self, num_masks: int = 1, mask_factor: float = 100, p_upperbound: float = 1.0): - self.num_masks = num_masks - self.mask_factor = mask_factor - self.p_upperbound = p_upperbound - - @tf.function - def augment(self, spectrogram: tf.Tensor): - """ - Masking the time channel (shape[0]) - Args: - spectrogram: shape (T, num_feature_bins, V) - Returns: - frequency masked spectrogram - """ - T, F, V = shape_list(spectrogram, out_type=tf.int32) - for _ in range(self.num_masks): - t = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32) - t = tf.minimum(t, tf.cast(tf.cast(T, dtype=tf.float32) * self.p_upperbound, dtype=tf.int32)) - t0 = tf.random.uniform([], minval=0, maxval=(T - t), dtype=tf.int32) - mask = tf.concat([ - tf.ones([t0, F, V], dtype=spectrogram.dtype), - tf.zeros([t, F, V], dtype=spectrogram.dtype), - 
tf.ones([T - t0 - t, F, V], dtype=spectrogram.dtype) - ], axis=0) - spectrogram = spectrogram * mask - return spectrogram diff --git a/tensorflow_asr/configs/config.py b/tensorflow_asr/configs/config.py index 7c3dcf6e5d..da79ddd1f0 100644 --- a/tensorflow_asr/configs/config.py +++ b/tensorflow_asr/configs/config.py @@ -14,7 +14,7 @@ from . import load_yaml from ..augmentations.augments import Augmentation -from ..utils.utils import preprocess_paths +from ..utils import file_util class DecoderConfig: @@ -25,12 +25,12 @@ def __init__(self, config: dict = None): self.norm_score = config.pop("norm_score", True) self.lm_config = config.pop("lm_config", {}) - self.vocabulary = preprocess_paths(config.pop("vocabulary", None)) + self.vocabulary = file_util.preprocess_paths(config.pop("vocabulary", None)) self.target_vocab_size = config.pop("target_vocab_size", 1024) self.max_subword_length = config.pop("max_subword_length", 4) - self.output_path_prefix = preprocess_paths(config.pop("output_path_prefix", None)) + self.output_path_prefix = file_util.preprocess_paths(config.pop("output_path_prefix", None)) self.model_type = config.pop("model_type", None) - self.corpus_files = preprocess_paths(config.pop("corpus_files", [])) + self.corpus_files = file_util.preprocess_paths(config.pop("corpus_files", [])) self.max_corpus_chars = config.pop("max_corpus_chars", None) self.reserved_tokens = config.pop("reserved_tokens", None) @@ -41,8 +41,8 @@ class DatasetConfig: def __init__(self, config: dict = None): if not config: config = {} self.stage = config.pop("stage", None) - self.data_paths = preprocess_paths(config.pop("data_paths", None)) - self.tfrecords_dir = preprocess_paths(config.pop("tfrecords_dir", None)) + self.data_paths = file_util.preprocess_paths(config.pop("data_paths", None)) + self.tfrecords_dir = file_util.preprocess_paths(config.pop("tfrecords_dir", None)) self.tfrecords_shards = config.pop("tfrecords_shards", 16) self.shuffle = config.pop("shuffle", False) 
self.cache = config.pop("cache", False) @@ -59,7 +59,7 @@ def __init__(self, config: dict = None): self.batch_size = config.pop("batch_size", 1) self.accumulation_steps = config.pop("accumulation_steps", 1) self.num_epochs = config.pop("num_epochs", 20) - self.outdir = preprocess_paths(config.pop("outdir", None)) + self.outdir = file_util.preprocess_paths(config.pop("outdir", None)) self.log_interval_steps = config.pop("log_interval_steps", 500) self.save_interval_steps = config.pop("save_interval_steps", 500) self.eval_interval_steps = config.pop("eval_interval_steps", 1000) @@ -81,7 +81,7 @@ class Config: """ User config class for training, testing or infering """ def __init__(self, path: str): - config = load_yaml(preprocess_paths(path)) + config = load_yaml(file_util.preprocess_paths(path)) self.speech_config = config.pop("speech_config", {}) self.decoder_config = config.pop("decoder_config", {}) self.model_config = config.pop("model_config", {}) diff --git a/tensorflow_asr/datasets/asr_dataset.py b/tensorflow_asr/datasets/asr_dataset.py index a8d8045680..f2d08de6ae 100755 --- a/tensorflow_asr/datasets/asr_dataset.py +++ b/tensorflow_asr/datasets/asr_dataset.py @@ -18,11 +18,11 @@ import numpy as np import tensorflow as tf -from ..augmentations.augments import Augmentation +from ..augmentations.augmentation import Augmentation from .base_dataset import BaseDataset, BUFFER_SIZE, TFRECORD_SHARDS, AUTOTUNE from ..featurizers.speech_featurizers import load_and_convert_to_wav, read_raw_audio, tf_read_raw_audio, SpeechFeaturizer from ..featurizers.text_featurizers import TextFeaturizer -from ..utils.utils import bytestring_feature, get_num_batches, preprocess_paths +from ..utils import feature_util, file_util, math_util, data_util class ASRDataset(BaseDataset): @@ -62,7 +62,7 @@ def compute_metadata(self): def save_metadata(self, metadata_prefix: str = None): if metadata_prefix is None: return - metadata_path = preprocess_paths(metadata_prefix) + ".metadata.json" + 
metadata_path = file_util.preprocess_paths(metadata_prefix) + ".metadata.json" if tf.io.gfile.exists(metadata_path): with tf.io.gfile.GFile(metadata_path, "r") as f: content = json.loads(f.read()) @@ -79,7 +79,7 @@ def save_metadata(self, metadata_prefix: str = None): def load_metadata(self, metadata_prefix: str = None): if metadata_prefix is None: return - metadata_path = preprocess_paths(metadata_prefix) + ".metadata.json" + metadata_path = file_util.preprocess_paths(metadata_prefix) + ".metadata.json" if tf.io.gfile.exists(metadata_path): print(f"Loading metadata from {metadata_path} ...") with tf.io.gfile.GFile(metadata_path, "r") as f: @@ -124,11 +124,11 @@ def preprocess(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor): def fn(_path: bytes, _audio: bytes, _indices: bytes): signal = read_raw_audio(_audio, sample_rate=self.speech_featurizer.sample_rate) - signal = self.augmentations.before.augment(signal) + signal = self.augmentations.signal_augment(signal) - features = self.speech_featurizer.extract(signal) + features = self.speech_featurizer.extract(signal.numpy()) - features = self.augmentations.after.augment(features) + features = self.augmentations.feature_augment(features) label = tf.strings.to_number(tf.strings.split(_indices), out_type=tf.int32) label_length = tf.cast(tf.shape(label)[0], tf.int32) @@ -148,11 +148,11 @@ def tf_preprocess(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor): with tf.device("/CPU:0"): signal = tf_read_raw_audio(audio, self.speech_featurizer.sample_rate) - signal = self.augmentations.before.augment(signal) + signal = self.augmentations.signal_augment(signal) features = self.speech_featurizer.tf_extract(signal) - features = self.augmentations.after.augment(features) + features = self.augmentations.feature_augment(features) label = tf.strings.to_number(tf.strings.split(indices), out_type=tf.int32) label_length = tf.cast(tf.shape(label)[0], tf.int32) @@ -168,12 +168,27 @@ def parse(self, path: tf.Tensor, 
audio: tf.Tensor, indices: tf.Tensor): Returns: path, features, input_lengths, labels, label_lengths, pred_inp """ - if self.use_tf: return self.tf_preprocess(path, audio, indices) - return self.preprocess(path, audio, indices) + if self.use_tf: data = self.tf_preprocess(path, audio, indices) + else: data = self.preprocess(path, audio, indices) + + _, features, input_length, label, label_length, prediction, prediction_length = data + + return ( + data_util.create_inputs( + inputs=features, + inputs_length=input_length, + predictions=prediction, + predictions_length=prediction_length + ), + data_util.create_labels( + labels=label, + labels_length=label_length + ) + ) # -------------------------------- CREATION ------------------------------------- - def process(self, dataset: tf.data.Dataset, batch_size: int): + def process(self, dataset, batch_size): dataset = dataset.map(self.parse, num_parallel_calls=AUTOTUNE) if self.cache: @@ -189,21 +204,35 @@ def process(self, dataset: tf.data.Dataset, batch_size: int): dataset = dataset.padded_batch( batch_size=batch_size, padded_shapes=( - tf.TensorShape([]), - tf.TensorShape(self.speech_featurizer.shape), - tf.TensorShape([]), - tf.TensorShape(self.text_featurizer.shape), - tf.TensorShape([]), - tf.TensorShape(self.text_featurizer.prepand_shape), - tf.TensorShape([]), + data_util.create_inputs( + inputs=tf.TensorShape(self.speech_featurizer.shape), + inputs_length=tf.TensorShape([]), + predictions=tf.TensorShape(self.text_featurizer.prepand_shape), + predictions_length=tf.TensorShape([]) + ), + data_util.create_labels( + labels=tf.TensorShape(self.text_featurizer.shape), + labels_length=tf.TensorShape([]) + ), + ), + padding_values=( + data_util.create_inputs( + inputs= 0., + inputs_length=0, + predictions=self.text_featurizer.blank, + predictions_length=0 + ), + data_util.create_labels( + labels=self.text_featurizer.blank, + labels_length=0 + ) ), - padding_values=(None, 0., 0, self.text_featurizer.blank, 0, 
self.text_featurizer.blank, 0), - drop_remainder=self.drop_remainder + drop_remainder = self.drop_remainder ) # PREFETCH to improve speed of input length dataset = dataset.prefetch(AUTOTUNE) - self.total_steps = get_num_batches(self.total_steps, batch_size, drop_remainders=self.drop_remainder) + self.total_steps = math_util.get_num_batches(self.total_steps, batch_size, drop_remainders=self.drop_remainder) return dataset def create(self, batch_size: int): @@ -254,9 +283,9 @@ def parse(record): def fn(path, indices): audio = load_and_convert_to_wav(path.decode("utf-8")).numpy() feature = { - "path": bytestring_feature([path]), - "audio": bytestring_feature([audio]), - "indices": bytestring_feature([indices]) + "path": feature_util.bytestring_feature([path]), + "audio": feature_util.bytestring_feature([audio]), + "indices": feature_util.bytestring_feature([indices]) } example = tf.train.Example(features=tf.train.Features(feature=feature)) return example.SerializeToString() diff --git a/tensorflow_asr/datasets/keras/__init__.py b/tensorflow_asr/datasets/keras/__init__.py deleted file mode 100644 index 5aee10fa36..0000000000 --- a/tensorflow_asr/datasets/keras/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from .asr_dataset import ASRDatasetKeras, ASRTFRecordDatasetKeras, ASRSliceDatasetKeras -__all__ = ['ASRDatasetKeras', 'ASRTFRecordDatasetKeras', 'ASRSliceDatasetKeras'] diff --git a/tensorflow_asr/datasets/keras/asr_dataset.py b/tensorflow_asr/datasets/keras/asr_dataset.py deleted file mode 100644 index 448ad4011e..0000000000 --- a/tensorflow_asr/datasets/keras/asr_dataset.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tensorflow as tf - -from ..asr_dataset import ASRDataset, ASRTFRecordDataset, ASRSliceDataset, AUTOTUNE, TFRECORD_SHARDS -from ..base_dataset import BUFFER_SIZE -from ...featurizers.speech_featurizers import SpeechFeaturizer -from ...featurizers.text_featurizers import TextFeaturizer -from ...utils.utils import get_num_batches -from ...augmentations.augments import Augmentation - - -class ASRDatasetKeras(ASRDataset): - """ Keras Dataset for ASR using Generator """ - - def parse(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor): - """ - Returns: - path, features, input_lengths, labels, label_lengths, pred_inp - """ - if self.use_tf: data = self.tf_preprocess(path, audio, indices) - else: data = self.preprocess(path, audio, indices) - - _, features, input_length, label, label_length, prediction, prediction_length = data - - return ( - { - "input": features, - "input_length": input_length, - "prediction": prediction, - "prediction_length": prediction_length - }, - { - "label": label, - "label_length": label_length - } - ) - - def process(self, dataset, batch_size): - dataset = dataset.map(self.parse, num_parallel_calls=AUTOTUNE) - - if self.cache: - dataset = dataset.cache() - - if self.shuffle: - dataset = dataset.shuffle(self.buffer_size, reshuffle_each_iteration=True) - - if self.indefinite: - dataset = dataset.repeat() - - # PADDED BATCH the dataset - dataset = dataset.padded_batch( - batch_size=batch_size, - padded_shapes=( - { - "input": tf.TensorShape(self.speech_featurizer.shape), - "input_length": tf.TensorShape([]), - "prediction": tf.TensorShape(self.text_featurizer.prepand_shape), - "prediction_length": tf.TensorShape([]) - }, - { - "label": tf.TensorShape(self.text_featurizer.shape), - "label_length": tf.TensorShape([]) - }, - ), - padding_values=( - { - "input": 0., - "input_length": 0, - "prediction": self.text_featurizer.blank, - "prediction_length": 0 - }, - { - "label": self.text_featurizer.blank, - "label_length": 0 - } - ), - 
drop_remainder=self.drop_remainder - ) - - # PREFETCH to improve speed of input length - dataset = dataset.prefetch(AUTOTUNE) - self.total_steps = get_num_batches(self.total_steps, batch_size, drop_remainders=self.drop_remainder) - return dataset - - -class ASRTFRecordDatasetKeras(ASRDatasetKeras, ASRTFRecordDataset): - """ Keras Dataset for ASR using TFRecords """ - - def __init__(self, - data_paths: list, - tfrecords_dir: str, - speech_featurizer: SpeechFeaturizer, - text_featurizer: TextFeaturizer, - stage: str, - augmentations: Augmentation = Augmentation(None), - tfrecords_shards: int = TFRECORD_SHARDS, - cache: bool = False, - shuffle: bool = False, - use_tf: bool = False, - indefinite: bool = False, - drop_remainder: bool = True, - buffer_size: int = BUFFER_SIZE, - **kwargs): - ASRTFRecordDataset.__init__( - self, stage=stage, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - data_paths=data_paths, tfrecords_dir=tfrecords_dir, augmentations=augmentations, cache=cache, shuffle=shuffle, - tfrecords_shards=tfrecords_shards, drop_remainder=drop_remainder, buffer_size=buffer_size, use_tf=use_tf, - indefinite=indefinite - ) - ASRDatasetKeras.__init__( - self, stage=stage, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - data_paths=data_paths, augmentations=augmentations, cache=cache, shuffle=shuffle, - drop_remainder=drop_remainder, buffer_size=buffer_size, use_tf=use_tf, - indefinite=indefinite - ) - - def parse(self, record: tf.Tensor): - feature_description = { - "path": tf.io.FixedLenFeature([], tf.string), - "audio": tf.io.FixedLenFeature([], tf.string), - "indices": tf.io.FixedLenFeature([], tf.string) - } - example = tf.io.parse_single_example(record, feature_description) - return ASRDatasetKeras.parse(self, **example) - - def process(self, dataset: tf.data.Dataset, batch_size: int): - return ASRDatasetKeras.process(self, dataset, batch_size) - - -class ASRSliceDatasetKeras(ASRDatasetKeras, ASRSliceDataset): - 
""" Keras Dataset for ASR using Slice """ - - def __init__(self, - stage: str, - speech_featurizer: SpeechFeaturizer, - text_featurizer: TextFeaturizer, - data_paths: list, - augmentations: Augmentation = Augmentation(None), - cache: bool = False, - shuffle: bool = False, - use_tf: bool = False, - indefinite: bool = False, - drop_remainder: bool = True, - buffer_size: int = BUFFER_SIZE, - **kwargs): - ASRSliceDataset.__init__( - self, stage=stage, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - data_paths=data_paths, augmentations=augmentations, cache=cache, shuffle=shuffle, - drop_remainder=drop_remainder, buffer_size=buffer_size, use_tf=use_tf, - indefinite=indefinite - ) - ASRDatasetKeras.__init__( - self, stage=stage, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - data_paths=data_paths, augmentations=augmentations, cache=cache, shuffle=shuffle, - drop_remainder=drop_remainder, buffer_size=buffer_size, use_tf=use_tf, - indefinite=indefinite - ) - - def parse(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor): - return ASRDatasetKeras.parse(self, path, audio, indices) - - def process(self, dataset: tf.data.Dataset, batch_size: int): - return ASRDatasetKeras.process(self, dataset, batch_size) diff --git a/tensorflow_asr/featurizers/methods/__init__.py b/tensorflow_asr/featurizers/methods/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorflow_asr/featurizers/gammatone.py b/tensorflow_asr/featurizers/methods/gammatone.py similarity index 100% rename from tensorflow_asr/featurizers/gammatone.py rename to tensorflow_asr/featurizers/methods/gammatone.py diff --git a/tensorflow_asr/featurizers/speech_featurizers.py b/tensorflow_asr/featurizers/speech_featurizers.py index c6905f9696..0cb133d6df 100755 --- a/tensorflow_asr/featurizers/speech_featurizers.py +++ b/tensorflow_asr/featurizers/speech_featurizers.py @@ -23,10 +23,10 @@ import tensorflow as tf import tensorflow_io as tfio 
-from ..utils.utils import log10, has_tpu -from .gammatone import fft_weights +from ..utils import math_util, env_util +from .methods import gammatone -tpu = has_tpu() +tpu = env_util.has_tpu() # def tf_resample(signal, rate_in, rate_out): @@ -398,16 +398,16 @@ def compute_log_mel_spectrogram(self, signal: np.ndarray) -> np.ndarray: def compute_log_gammatone_spectrogram(self, signal: np.ndarray) -> np.ndarray: S = self.stft(signal) - gammatone = fft_weights(self.nfft, self.sample_rate, - self.num_feature_bins, width=1.0, - fmin=0, fmax=int(self.sample_rate / 2), - maxlen=(self.nfft / 2 + 1)) + gtone = gammatone.fft_weights(self.nfft, self.sample_rate, + self.num_feature_bins, width=1.0, + fmin=0, fmax=int(self.sample_rate / 2), + maxlen=(self.nfft / 2 + 1)) - gammatone = gammatone.numpy().astype(np.float32) + gtone = gtone.numpy().astype(np.float32) - gammatone_spectrogram = np.dot(S.T, gammatone) + gtone_spectrogram = np.dot(S.T, gtone) - return self.power_to_db(gammatone_spectrogram) + return self.power_to_db(gtone_spectrogram) class TFSpeechFeaturizer(SpeechFeaturizer): @@ -438,8 +438,8 @@ def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0): else: ref_value = np.abs(ref) - log_spec = 10.0 * log10(tf.maximum(amin, magnitude)) - log_spec -= 10.0 * log10(tf.maximum(amin, ref_value)) + log_spec = 10.0 * math_util.log10(tf.maximum(amin, magnitude)) + log_spec -= 10.0 * math_util.log10(tf.maximum(amin, ref_value)) if top_db is not None: if top_db < 0: @@ -507,11 +507,11 @@ def compute_mfcc(self, signal): def compute_log_gammatone_spectrogram(self, signal: np.ndarray) -> np.ndarray: S = self.stft(signal) - gammatone = fft_weights(self.nfft, self.sample_rate, - self.num_feature_bins, width=1.0, - fmin=0, fmax=int(self.sample_rate / 2), - maxlen=(self.nfft / 2 + 1)) + gtone = gammatone.fft_weights(self.nfft, self.sample_rate, + self.num_feature_bins, width=1.0, + fmin=0, fmax=int(self.sample_rate / 2), + maxlen=(self.nfft / 2 + 1)) - gammatone_spectrogram = 
tf.tensordot(S, gammatone, 1) + gtone_spectrogram = tf.tensordot(S, gtone, 1) - return self.power_to_db(gammatone_spectrogram) + return self.power_to_db(gtone_spectrogram) diff --git a/tensorflow_asr/featurizers/text_featurizers.py b/tensorflow_asr/featurizers/text_featurizers.py index a6b5dfbb8a..074a159429 100755 --- a/tensorflow_asr/featurizers/text_featurizers.py +++ b/tensorflow_asr/featurizers/text_featurizers.py @@ -23,8 +23,7 @@ import tensorflow_datasets as tds from ..configs.config import DecoderConfig -from ..utils.utils import preprocess_paths -from . import wordpiece +from ..utils import file_util ENGLISH_CHARACTERS = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] @@ -251,13 +250,13 @@ def corpus_generator(): @classmethod def load_from_file(cls, decoder_config: dict, filename: str = None): dconf = DecoderConfig(decoder_config.copy()) - filename = dconf.vocabulary if filename is None else preprocess_paths(filename) + filename = dconf.vocabulary if filename is None else file_util.preprocess_paths(filename) filename_prefix = os.path.splitext(filename)[0] subwords = tds.deprecated.text.SubwordTextEncoder.load_from_file(filename_prefix) return cls(decoder_config, subwords) def save_to_file(self, filename: str = None): - filename = self.decoder_config.vocabulary if filename is None else preprocess_paths(filename) + filename = self.decoder_config.vocabulary if filename is None else file_util.preprocess_paths(filename) filename_prefix = os.path.splitext(filename)[0] return self.subwords.save_to_file(filename_prefix) @@ -325,114 +324,6 @@ def indices2upoints(self, indices: tf.Tensor) -> tf.Tensor: return tf.gather_nd(upoints, tf.where(tf.not_equal(upoints, 0))) -class TFSubwordFeaturizer(TextFeaturizer): - """ - Extract text feature based on char-level granularity. 
- By looking up the vocabulary table, each line of transcript will be - converted to a sequence of integer indexes. - """ - - def __init__(self, decoder_config: dict, subwords=None): - """ - decoder_config = { - "target_vocab_size": int, - "max_subword_length": 4, - "max_corpus_chars": None, - "reserved_tokens": None, - "beam_width": int, - "lm_config": { - ... - } - } - """ - super(TFSubwordFeaturizer, self).__init__(decoder_config) - self.subwords = self.__load_subwords() if subwords is None else subwords - self.blank = 0 # subword treats blank as 0 - self.num_classes = self.subwords.vocab_size - - def __load_subwords(self): - return wordpiece.WordpieceTokenizer(self.decoder_config.vocabulary, token_out_type=tf.int32) - - @classmethod - def build_from_corpus(cls, decoder_config: dict, corpus_files: list = None, output_file: str = None): - dconf = DecoderConfig(decoder_config.copy()) - corpus_files = dconf.corpus_files if corpus_files is None or len(corpus_files) == 0 else corpus_files - filename = dconf.vocabulary if output_file is None else preprocess_paths(output_file) - - def corpus_generator(): - for file in corpus_files: - with open(file, "r", encoding="utf-8") as f: - lines = f.read().splitlines() - lines = lines[1:] - for line in lines: - line = line.split("\t") - yield line[-1] - - wordpiece.build_from_corpus( - corpus_generator(), - output_file_path=filename, - target_vocab_size=dconf.target_vocab_size, - max_subword_length=dconf.max_subword_length, - max_corpus_chars=dconf.max_corpus_chars, - reserved_tokens=dconf.reserved_tokens - ) - - subwords = wordpiece.WordpieceTokenizer(filename, token_out_type=tf.int32) - return cls(decoder_config, subwords) - - @classmethod - def load_from_file(cls, decoder_config: dict, filename: str = None): - dconf = DecoderConfig(decoder_config.copy()) - filename = dconf.vocabulary if filename is None else preprocess_paths(filename) - subwords = wordpiece.WordpieceTokenizer(filename, token_out_type=tf.int32) - return 
cls(decoder_config, subwords) - - def extract(self, text: tf.Tensor) -> tf.Tensor: - """ - Convert string to a list of integers - Args: - text: string (sequence of characters) - - Returns: - sequence of ints in tf.Tensor - """ - indices = self.subwords.tokenize(text) - indices = indices.merge_dims(0, -1) - return indices.to_tensor() - - def iextract(self, indices: tf.Tensor) -> tf.Tensor: - """ - Convert list of indices to string - Args: - indices: tf.Tensor with dim [B, None] - - Returns: - transcripts: tf.Tensor of dtype tf.string with dim [B] - """ - with tf.device("/CPU:0"): # string data is not supported on GPU - indices = self.normalize_indices(indices) - text = self.subwords.detokenize(indices) - return tf.strings.reduce_join(text, separator=" ", axis=-1) - - @tf.function( - input_signature=[ - tf.TensorSpec([None], dtype=tf.int32) - ] - ) - def indices2upoints(self, indices: tf.Tensor) -> tf.Tensor: - """ - Transform Predicted Indices to Unicode Code Points (for using tflite) - Args: - indices: tf.Tensor of Classes in shape [None] - - Returns: - unicode code points transcript with dtype tf.int32 and shape [None] - """ - with tf.name_scope("indices2upoints"): - text = self.iextract(tf.expand_dims(indices, axis=0)) - return tf.reshape(text, shape=[-1]) - - class SentencePieceFeaturizer(TextFeaturizer): """ Extract text feature based on sentence piece package. 
@@ -512,7 +403,7 @@ def corpus_iterator(): @classmethod def load_from_file(cls, decoder_config: dict, filename: str = None): if filename is not None: - filename_prefix = os.path.splitext(preprocess_paths(filename))[0] + filename_prefix = os.path.splitext(file_util.preprocess_paths(filename))[0] else: filename_prefix = decoder_config.get("output_path_prefix", None) processor = sp.SentencePieceProcessor() diff --git a/tensorflow_asr/featurizers/wordpiece.py b/tensorflow_asr/featurizers/wordpiece.py deleted file mode 100644 index 56aa599953..0000000000 --- a/tensorflow_asr/featurizers/wordpiece.py +++ /dev/null @@ -1,577 +0,0 @@ -# coding=utf-8 -# Copyright 2021 TF.Text Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Algorithm for learning wordpiece vocabulary.""" - -import re -import collections -from typing import List, Optional - -import tensorflow as tf - -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import check_ops -from tensorflow.python.ops import sort_ops -from tensorflow.python.ops import string_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops.ragged import ragged_string_ops -from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor - -import numpy as np -import tensorflow_text as tft - -Params = collections.namedtuple("Params", [ - "upper_thresh", "lower_thresh", "num_iterations", "max_input_tokens", - "max_token_length", "max_unique_chars", "vocab_size", "slack_ratio", - "include_joiner_token", "joiner", "reserved_tokens" -]) - - -def extract_char_tokens(word_counts): - """Extracts all single-character tokens from word_counts. - Args: - word_counts: list of (string, int) tuples - Returns: - set of single-character strings contained within word_counts - """ - - seen_chars = set() - for word, _ in word_counts: - for char in word: - seen_chars.add(char) - return seen_chars - - -def ensure_all_tokens_exist(input_tokens, output_tokens, include_joiner_token, - joiner): - """Adds all tokens in input_tokens to output_tokens if not already present. 
- Args: - input_tokens: set of strings (tokens) we want to include - output_tokens: string to int dictionary mapping token to count - include_joiner_token: bool whether to include joiner token - joiner: string used to indicate suffixes - Returns: - string to int dictionary with all tokens in input_tokens included - """ - - for token in input_tokens: - if token not in output_tokens: - output_tokens[token] = 1 - - if include_joiner_token: - joined_token = joiner + token - if joined_token not in output_tokens: - output_tokens[joined_token] = 1 - - return output_tokens - - -def get_split_indices(word, curr_tokens, include_joiner_token, joiner): - """Gets indices for valid substrings of word, for iterations > 0. - For iterations > 0, rather than considering every possible substring, we only - want to consider starting points corresponding to the start of wordpieces in - the current vocabulary. - Args: - word: string we want to split into substrings - curr_tokens: string to int dict of tokens in vocab (from previous iteration) - include_joiner_token: bool whether to include joiner token - joiner: string used to indicate suffixes - Returns: - list of ints containing valid starting indices for word - """ - - indices = [] - start = 0 - while start < len(word): - end = len(word) - while end > start: - subtoken = word[start:end] - # Subtoken includes the joiner token. - if include_joiner_token and start > 0: - subtoken = joiner + subtoken - # If subtoken is part of vocab, "end" is a valid start index. - if subtoken in curr_tokens: - indices.append(end) - break - end -= 1 - - if end == start: - return None - start = end - - return indices - - -def get_search_threshs(word_counts, upper_thresh, lower_thresh): - """Clips the thresholds for binary search based on current word counts. - The upper threshold parameter typically has a large default value that can - result in many iterations of unnecessary search. 
Thus we clip the upper and - lower bounds of search to the maximum and the minimum wordcount values. - Args: - word_counts: list of (string, int) tuples - upper_thresh: int, upper threshold for binary search - lower_thresh: int, lower threshold for binary search - Returns: - upper_search: int, clipped upper threshold for binary search - lower_search: int, clipped lower threshold for binary search - """ - - counts = [count for _, count in word_counts] - max_count = max(counts) - min_count = min(counts) - - if upper_thresh is None: - upper_search = max_count - else: - upper_search = max_count if max_count < upper_thresh else upper_thresh - - if lower_thresh is None: - lower_search = min_count - else: - lower_search = min_count if min_count > lower_thresh else lower_thresh - - return upper_search, lower_search - - -def get_input_words(word_counts, reserved_tokens, max_token_length): - """Filters out words that are longer than max_token_length or are reserved. - Args: - word_counts: list of (string, int) tuples - reserved_tokens: list of strings - max_token_length: int, maximum length of a token - Returns: - list of (string, int) tuples of filtered wordcounts - """ - - all_counts = [] - - for word, count in word_counts: - if len(word) > max_token_length or word in reserved_tokens: - continue - all_counts.append((word, count)) - - return all_counts - - -def get_allowed_chars(all_counts, max_unique_chars): - """Get the top max_unique_chars characters within our wordcounts. - We want each character to be in the vocabulary so that we can keep splitting - down to the character level if necessary. However, in order not to inflate - our vocabulary with rare characters, we only keep the top max_unique_chars - characters. 
- Args: - all_counts: list of (string, int) tuples - max_unique_chars: int, maximum number of unique single-character tokens - Returns: - set of strings containing top max_unique_chars characters in all_counts - """ - - char_counts = collections.defaultdict(int) - - for word, count in all_counts: - for char in word: - char_counts[char] += count - - # Sort by count, then alphabetically. - sorted_counts = sorted(sorted(char_counts.items(), key=lambda x: x[0]), - key=lambda x: x[1], reverse=True) - - allowed_chars = set() - for i in range(min(len(sorted_counts), max_unique_chars)): - allowed_chars.add(sorted_counts[i][0]) - return allowed_chars - - -def filter_input_words(all_counts, allowed_chars, max_input_tokens): - """Filters out words with unallowed chars and limits words to max_input_tokens. - Args: - all_counts: list of (string, int) tuples - allowed_chars: list of single-character strings - max_input_tokens: int, maximum number of tokens accepted as input - Returns: - list of (string, int) tuples of filtered wordcounts - """ - # Ensure that the input is sorted so that if `max_input_tokens` is reached - # the least common tokens are dropped. - all_counts = sorted( - all_counts, key=lambda word_and_count: word_and_count[1], reverse=True) - filtered_counts = [] - for word, count in all_counts: - if (max_input_tokens != -1 and - len(filtered_counts) >= max_input_tokens): - break - has_unallowed_chars = False - for char in word: - if char not in allowed_chars: - has_unallowed_chars = True - break - if has_unallowed_chars: - continue - filtered_counts.append((word, count)) - - return filtered_counts - - -def generate_final_vocabulary(reserved_tokens, char_tokens, curr_tokens): - """Generates final vocab given reserved, single-character, and current tokens. 
- Args: - reserved_tokens: list of strings (tokens) that must be included in vocab - char_tokens: set of single-character strings - curr_tokens: string to int dict mapping token to count - Returns: - list of strings representing final vocabulary - """ - - sorted_char_tokens = sorted(list(char_tokens)) - vocab_char_arrays = [] - vocab_char_arrays.extend(reserved_tokens) - vocab_char_arrays.extend(sorted_char_tokens) - - # Sort by count, then alphabetically. - sorted_tokens = sorted(sorted(curr_tokens.items(), key=lambda x: x[0]), - key=lambda x: x[1], reverse=True) - for token, _ in sorted_tokens: - vocab_char_arrays.append(token) - - seen_tokens = set() - # Adding unique tokens to list to maintain sorted order. - vocab_words = [] - for word in vocab_char_arrays: - if word in seen_tokens: - continue - seen_tokens.add(word) - vocab_words.append(word) - - return vocab_words - - -def learn_with_thresh(word_counts, thresh, params): - """Wordpiece learning algorithm to produce a vocab given frequency threshold. - Args: - word_counts: list of (string, int) tuples - thresh: int, frequency threshold for a token to be included in the vocab - params: Params namedtuple, parameters for learning - Returns: - list of strings, vocabulary generated for the given thresh - """ - - # Set of single-character tokens. - char_tokens = extract_char_tokens(word_counts) - curr_tokens = ensure_all_tokens_exist(char_tokens, {}, - params.include_joiner_token, - params.joiner) - - for iteration in range(params.num_iterations): - subtokens = [dict() for _ in range(params.max_token_length + 1)] - # Populate array with counts of each subtoken. 
- for word, count in word_counts: - if iteration == 0: - split_indices = range(1, len(word) + 1) - else: - split_indices = get_split_indices(word, curr_tokens, - params.include_joiner_token, - params.joiner) - if not split_indices: - continue - - start = 0 - for index in split_indices: - for end in range(start + 1, len(word) + 1): - subtoken = word[start:end] - length = len(subtoken) - if params.include_joiner_token and start > 0: - subtoken = params.joiner + subtoken - if subtoken in subtokens[length]: - # Subtoken exists, increment count. - subtokens[length][subtoken] += count - else: - # New subtoken, add to dict. - subtokens[length][subtoken] = count - start = index - - next_tokens = {} - # Get all tokens that have a count above the threshold. - for length in range(params.max_token_length, 0, -1): - for token, count in subtokens[length].items(): - if count >= thresh: - next_tokens[token] = count - # Decrement the count of all prefixes. - if len(token) > length: # This token includes the joiner. - joiner_len = len(params.joiner) - for i in range(1 + joiner_len, length + joiner_len): - prefix = token[0:i] - if prefix in subtokens[i - joiner_len]: - subtokens[i - joiner_len][prefix] -= count - else: - for i in range(1, length): - prefix = token[0:i] - if prefix in subtokens[i]: - subtokens[i][prefix] -= count - - # Add back single-character tokens. - curr_tokens = ensure_all_tokens_exist(char_tokens, next_tokens, - params.include_joiner_token, - params.joiner) - - vocab_words = generate_final_vocabulary(params.reserved_tokens, char_tokens, - curr_tokens) - - return vocab_words - - -def learn_binary_search(word_counts, lower, upper, params): - """Performs binary search to find wordcount frequency threshold. - Given upper and lower bounds and a list of (word, count) tuples, performs - binary search to find the threshold closest to producing a vocabulary - of size vocab_size. 
- Args: - word_counts: list of (string, int) tuples - lower: int, lower bound for binary search - upper: int, upper bound for binary search - params: Params namedtuple, parameters for learning - Returns: - list of strings, vocab that is closest to target vocab_size - """ - thresh = (upper + lower) // 2 - current_vocab = learn_with_thresh(word_counts, thresh, params) - current_vocab_size = len(current_vocab) - - # Allow count to be within k% of the target count, where k is slack ratio. - slack_count = params.slack_ratio * params.vocab_size - if slack_count < 0: - slack_count = 0 - - is_within_slack = (current_vocab_size <= params.vocab_size) and ( - params.vocab_size - current_vocab_size <= slack_count) - - # We"ve created a vocab within our goal range (or, ran out of search space). - if is_within_slack or lower >= upper or thresh <= 1: - return current_vocab - - current_vocab = None - - if current_vocab_size > params.vocab_size: - return learn_binary_search(word_counts, thresh + 1, upper, params) - - else: - return learn_binary_search(word_counts, lower, thresh - 1, params) - - -def count_words(iterable) -> collections.Counter: - """Converts a iterable of arrays of words into a `Counter` of word counts.""" - counts = collections.Counter() - for words in iterable: - # Convert a RaggedTensor to a flat/dense Tensor. - words = getattr(words, "flat_values", words) - # Flatten any dense tensor - words = np.reshape(words, [-1]) - counts.update(words) - - # Decode the words if necessary. 
- example_word = next(iter(counts.keys())) - if isinstance(example_word, bytes): - counts = collections.Counter( - {word.decode("utf-8"): count for word, count in counts.items()}) - - return counts - - -def learn(word_counts, - vocab_size: int, - reserved_tokens: List[str] = [], - upper_thresh: Optional[int] = int(1e7), - lower_thresh: Optional[int] = 10, - num_iterations: int = 4, - max_input_tokens: Optional[int] = int(5e6), - max_token_length: int = 50, - max_unique_chars: int = 1000, - slack_ratio: float = 0.05, - include_joiner_token: bool = True, - joiner: str = "##") -> List[str]: - """Takes in wordcounts and returns wordpiece vocabulary. - Args: - word_counts: (word, count) pairs as a dictionary, or list of tuples. - vocab_size: The target vocabulary size. This is the maximum size. - reserved_tokens: A list of tokens that must be included in the vocabulary. - upper_thresh: Initial upper bound on the token frequency threshold. - lower_thresh: Initial lower bound on the token frequency threchold. - num_iterations: Number of iterations to run. - max_input_tokens: The maximum number of words in the initial vocabulary. The - words with the lowest counts are discarded. Use `None` or `-1` for "no - maximum". - max_token_length: The maximum token length. Counts for longer words are - discarded. - max_unique_chars: The maximum alphabet size. This prevents rare characters - from inflating the vocabulary. Counts for words containing characters - ouside of the selected alphabet are discarded. - slack_ratio: The maximum deviation acceptable from `vocab_size` for an - acceptable vocabulary. The acceptable range of vocabulary sizes is from - `vocab_size*(1-slack_ratio)` to `vocab_size`. - include_joiner_token: If true, include the `joiner` token in the output - vocabulary. - joiner: The prefix to include on suffix tokens in the output vocabulary. - Usually "##". For example "places" could be tokenized as `["place", - "##s"]`. 
- Returns: - string, final vocabulary with each word separated by newline - """ - if isinstance(word_counts, dict): - word_counts = word_counts.items() - - params = Params(upper_thresh, lower_thresh, num_iterations, max_input_tokens, - max_token_length, max_unique_chars, vocab_size, slack_ratio, - include_joiner_token, joiner, reserved_tokens) - - upper_search, lower_search = get_search_threshs(word_counts, - params.upper_thresh, - params.lower_thresh) - all_counts = get_input_words(word_counts, params.reserved_tokens, - params.max_token_length) - allowed_chars = get_allowed_chars(all_counts, params.max_unique_chars) - - filtered_counts = filter_input_words(all_counts, allowed_chars, - params.max_input_tokens) - - vocab = learn_binary_search(filtered_counts, lower_search, upper_search, - params) - - return vocab - - -def build_word_counts(corpus_generator): - counts = {} - for transcript in corpus_generator: - words = transcript.split() - for word in words: - if counts.get(word, None) is None: - counts[word] = 0 - else: - counts[word] += 1 - return counts - - -def build_from_corpus(corpus_generator, - target_vocab_size: int, - output_file_path: str, - max_subword_length: int = 50, - max_corpus_chars: int = None, - reserved_tokens: List[str] = [], - num_iterations: int = 4): - word_counts = build_word_counts(corpus_generator) - max_corpus_chars = max_corpus_chars or 1e7 - reserved_tokens = reserved_tokens or [] - vocab = learn(word_counts, target_vocab_size, - reserved_tokens=reserved_tokens, num_iterations=num_iterations, - max_input_tokens=10000000, max_token_length=max_subword_length, max_unique_chars=max_corpus_chars) - with open(output_file_path, "w") as f: - for token in vocab: print(token, file=f) - - -class WordpieceTokenizer(tft.WordpieceTokenizer): - @property - def vocab_size(self): - vocab, _ = self._get_vocab_and_ids() - return tf.shape(vocab)[0].numpy() - - def _get_vocab_and_ids(self): - export = getattr(self._vocab_lookup_table, 'export', None) - if 
export is None: - table = getattr(self._vocab_lookup_table, '_table') - export = table.export - - vocab, ids = export() # pylint: disable=protected-access - - # `.export` doesn't set the shapes. - vocab = check_ops.ensure_shape(vocab, [ - None, - ]) - ids = check_ops.ensure_shape(ids, [ - None, - ]) - - order = sort_ops.argsort(ids) - - ids = array_ops.gather(ids, order) - vocab = array_ops.gather(vocab, order) - - return vocab, ids - - def detokenize(self, token_ids): - r"""Convert a `Tensor` or `RaggedTensor` of wordpiece IDs to string-words. - >>> import pathlib - >>> pathlib.Path('vocab.txt').write_text( - ... "a b c ##a ##b ##c".replace(' ', '\n')) - >>> wordpiece = text.WordpieceTokenizer('vocab.txt') - >>> token_ids = [[0, 4, 5, 2, 5, 5, 5]] - >>> wordpiece.detokenize(token_ids) - - The word pieces are joined along the innermost axis to make words. So the - result has the same rank as the input, but the innermost axis of the result - indexes words instead of word pieces. - The shape transformation is: `[..., wordpieces] => [..., words]` - When the input shape is `[..., words, wordpieces]` (like the output of - `WordpieceTokenizer.tokenize`) the result's shape is `[..., words, 1]`. - The additional ragged axis can be removed using `words.merge_dims(-2, -1)`. - Note: This method assumes wordpiece IDs are dense on the interval - `[0, vocab_size)`. - Args: - token_ids: A `RaggedTensor` or `Tensor` with an int dtype. Must have - `ndims >= 2` - Returns: - A `RaggedTensor` with dtype `string` and the rank as the input - `token_ids`. - """ - # If there are performance issues with this method or problems with lookup - # tables using sparse IDs see the notes in b/177610044. 
- vocab, ids = self._get_vocab_and_ids() - - first_is_zero = tf.math.equal(ids[0], 0) - steps = ids[1:] - ids[:-1] - all_one_step = tf.reduce_all(tf.math.equal(steps, 1)) - - check = control_flow_ops.Assert( - first_is_zero & all_one_step, - data=[('`detokenize` only works with vocabulary tables where the ' - 'indices are dense on the interval `[0, vocab_size)`')]) - with ops.control_dependencies([check]): - token_ids = tf.math.minimum( - token_ids, - # Limit the OOV buckets to a single index. - tf.cast(array_ops.size(vocab), token_ids.dtype)) - - # Add the unknown token at that index. - vocab = array_ops.concat([vocab, [self._unknown_token]], axis=0) - - # Lookup the text tokens and join them along the innermost axis. - txt_tokens = array_ops.gather(vocab, token_ids) - - # Ensure the input is Ragged. - if not isinstance(txt_tokens, RaggedTensor): - txt_tokens = RaggedTensor.from_tensor(txt_tokens) - - # Join the tokens along the last axis. - words = string_ops.reduce_join_v2(txt_tokens, axis=-1, separator=' ') - - # Collapse " ##" in all strings to make words. - words = string_ops.regex_replace( - words, ' ' + re.escape(self._suffix_indicator), '') - - # Strip leading and trailing spaces. - words = string_ops.regex_replace(words, '^ +| +$', '') - - # Split on spaces so the last axis is "words". - words = ragged_string_ops.string_split_v2(words, sep=' ') - return words diff --git a/tensorflow_asr/models/keras/conformer.py b/tensorflow_asr/models/keras/conformer.py deleted file mode 100644 index d8aa36f7d1..0000000000 --- a/tensorflow_asr/models/keras/conformer.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .transducer import Transducer -from ..conformer import ConformerEncoder, L2 - - -class Conformer(Transducer): - def __init__(self, - vocabulary_size: int, - encoder_subsampling: dict, - encoder_positional_encoding: str = "sinusoid", - encoder_dmodel: int = 144, - encoder_num_blocks: int = 16, - encoder_head_size: int = 36, - encoder_num_heads: int = 4, - encoder_mha_type: str = "relmha", - encoder_kernel_size: int = 32, - encoder_depth_multiplier: int = 1, - encoder_fc_factor: float = 0.5, - encoder_dropout: float = 0, - encoder_trainable: bool = True, - prediction_embed_dim: int = 512, - prediction_embed_dropout: int = 0, - prediction_num_rnns: int = 1, - prediction_rnn_units: int = 320, - prediction_rnn_type: str = "lstm", - prediction_rnn_implementation: int = 2, - prediction_layer_norm: bool = True, - prediction_projection_units: int = 0, - prediction_trainable: bool = True, - joint_dim: int = 1024, - joint_activation: str = "tanh", - prejoint_linear: bool = True, - postjoint_linear: bool = False, - joint_mode: str = "add", - joint_trainable: bool = True, - kernel_regularizer=L2, - bias_regularizer=L2, - name: str = "conformer", - **kwargs): - super(Conformer, self).__init__( - encoder=ConformerEncoder( - subsampling=encoder_subsampling, - positional_encoding=encoder_positional_encoding, - dmodel=encoder_dmodel, - num_blocks=encoder_num_blocks, - head_size=encoder_head_size, - num_heads=encoder_num_heads, - mha_type=encoder_mha_type, - kernel_size=encoder_kernel_size, - depth_multiplier=encoder_depth_multiplier, - fc_factor=encoder_fc_factor, - 
dropout=encoder_dropout, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - trainable=encoder_trainable, - name=f"{name}_encoder" - ), - vocabulary_size=vocabulary_size, - embed_dim=prediction_embed_dim, - embed_dropout=prediction_embed_dropout, - num_rnns=prediction_num_rnns, - rnn_units=prediction_rnn_units, - rnn_type=prediction_rnn_type, - rnn_implementation=prediction_rnn_implementation, - layer_norm=prediction_layer_norm, - projection_units=prediction_projection_units, - prediction_trainable=prediction_trainable, - joint_dim=joint_dim, - joint_activation=joint_activation, - prejoint_linear=prejoint_linear, - postjoint_linear=postjoint_linear, - joint_mode=joint_mode, - joint_trainable=joint_trainable, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - name=name, - **kwargs - ) - self.dmodel = encoder_dmodel - self.time_reduction_factor = self.encoder.conv_subsampling.time_reduction_factor diff --git a/tensorflow_asr/models/keras/contextnet.py b/tensorflow_asr/models/keras/contextnet.py deleted file mode 100644 index 7e43cbbbfb..0000000000 --- a/tensorflow_asr/models/keras/contextnet.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import List -import tensorflow as tf - -from .transducer import Transducer -from ..contextnet import ContextNetEncoder, L2 -from ...utils.utils import get_reduced_length - - -class ContextNet(Transducer): - def __init__(self, - vocabulary_size: int, - encoder_blocks: List[dict], - encoder_alpha: float = 0.5, - encoder_trainable: bool = True, - prediction_embed_dim: int = 512, - prediction_embed_dropout: int = 0, - prediction_num_rnns: int = 1, - prediction_rnn_units: int = 320, - prediction_rnn_type: str = "lstm", - prediction_rnn_implementation: int = 2, - prediction_layer_norm: bool = True, - prediction_projection_units: int = 0, - prediction_trainable: bool = True, - joint_dim: int = 1024, - joint_activation: str = "tanh", - prejoint_linear: bool = True, - postjoint_linear: bool = False, - joint_mode: str = "add", - joint_trainable: bool = True, - kernel_regularizer=L2, - bias_regularizer=L2, - name: str = "contextnet", - **kwargs): - super(ContextNet, self).__init__( - encoder=ContextNetEncoder( - blocks=encoder_blocks, - alpha=encoder_alpha, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - trainable=encoder_trainable, - name=f"{name}_encoder" - ), - vocabulary_size=vocabulary_size, - embed_dim=prediction_embed_dim, - embed_dropout=prediction_embed_dropout, - num_rnns=prediction_num_rnns, - rnn_units=prediction_rnn_units, - rnn_type=prediction_rnn_type, - rnn_implementation=prediction_rnn_implementation, - layer_norm=prediction_layer_norm, - projection_units=prediction_projection_units, - prediction_trainable=prediction_trainable, - joint_dim=joint_dim, - joint_activation=joint_activation, - prejoint_linear=prejoint_linear, - postjoint_linear=postjoint_linear, - joint_mode=joint_mode, - joint_trainable=joint_trainable, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - name=name, - **kwargs - ) - self.dmodel = self.encoder.blocks[-1].dmodel - self.time_reduction_factor = 1 - for block in 
self.encoder.blocks: self.time_reduction_factor *= block.time_reduction_factor - - def call(self, inputs, training=False, **kwargs): - enc = self.encoder([inputs["input"], inputs["input_length"]], training=training, **kwargs) - pred = self.predict_net([inputs["prediction"], inputs["prediction_length"]], training=training, **kwargs) - outputs = self.joint_net([enc, pred], training=training, **kwargs) - return { - "logit": outputs, - "logit_length": get_reduced_length(inputs["input_length"], self.time_reduction_factor) - } - - def encoder_inference(self, features: tf.Tensor, input_length: tf.Tensor): - with tf.name_scope(f"{self.name}_encoder"): - input_length = tf.expand_dims(tf.shape(features)[0], axis=0) - outputs = tf.expand_dims(features, axis=0) - outputs = self.encoder([outputs, input_length], training=False) - return tf.squeeze(outputs, axis=0) - - # -------------------------------- GREEDY ------------------------------------- - - @tf.function - def recognize(self, - features: tf.Tensor, - input_length: tf.Tensor, - parallel_iterations: int = 10, - swap_memory: bool = True): - """ - RNN Transducer Greedy decoding - Args: - features (tf.Tensor): a batch of padded extracted features - - Returns: - tf.Tensor: a batch of decoded transcripts - """ - encoded = self.encoder([features, input_length], training=False) - return self._perform_greedy_batch(encoded, input_length, - parallel_iterations=parallel_iterations, swap_memory=swap_memory) - - def recognize_tflite(self, signal, predicted, prediction_states): - """ - Function to convert to tflite using greedy decoding (default streaming mode) - Args: - signal: tf.Tensor with shape [None] indicating a single audio signal - predicted: last predicted character with shape [] - prediction_states: lastest prediction states with shape [num_rnns, 1 or 2, 1, P] - - Return: - transcript: tf.Tensor of Unicode Code Points with shape [None] and dtype tf.int32 - predicted: last predicted character with shape [] - encoder_states: 
lastest encoder states with shape [num_rnns, 1 or 2, 1, P] - prediction_states: lastest prediction states with shape [num_rnns, 1 or 2, 1, P] - """ - features = self.speech_featurizer.tf_extract(signal) - encoded = self.encoder_inference(features, tf.shape(features)[0]) - hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, prediction_states) - transcript = self.text_featurizer.indices2upoints(hypothesis.prediction) - return transcript, hypothesis.index, hypothesis.states - - def recognize_tflite_with_timestamp(self, signal, predicted, states): - features = self.speech_featurizer.tf_extract(signal) - encoded = self.encoder_inference(features, tf.shape(features)[0]) - hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, states) - indices = self.text_featurizer.normalize_indices(hypothesis.prediction) - upoints = tf.gather_nd(self.text_featurizer.upoints, tf.expand_dims(indices, axis=-1)) # [None, max_subword_length] - - num_samples = tf.cast(tf.shape(signal)[0], dtype=tf.float32) - total_time_reduction_factor = self.time_reduction_factor * self.speech_featurizer.frame_step - - stime = tf.range(0, num_samples, delta=total_time_reduction_factor, dtype=tf.float32) - stime /= tf.cast(self.speech_featurizer.sample_rate, dtype=tf.float32) - - etime = tf.range(total_time_reduction_factor, num_samples, delta=total_time_reduction_factor, dtype=tf.float32) - etime /= tf.cast(self.speech_featurizer.sample_rate, dtype=tf.float32) - - non_blank = tf.where(tf.not_equal(upoints, 0)) - non_blank_transcript = tf.gather_nd(upoints, non_blank) - non_blank_stime = tf.gather_nd(tf.repeat(tf.expand_dims(stime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank) - non_blank_etime = tf.gather_nd(tf.repeat(tf.expand_dims(etime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank) - - return non_blank_transcript, non_blank_stime, non_blank_etime, hypothesis.index, hypothesis.states - - # -------------------------------- BEAM SEARCH 
------------------------------------- - - @tf.function - def recognize_beam(self, - features: tf.Tensor, - input_length: tf.Tensor, - lm: bool = False, - parallel_iterations: int = 10, - swap_memory: bool = True): - """ - RNN Transducer Beam Search - Args: - features (tf.Tensor): a batch of padded extracted features - lm (bool, optional): whether to use language model. Defaults to False. - - Returns: - tf.Tensor: a batch of decoded transcripts - """ - encoded = self.encoder([features, input_length], training=False) - return self._perform_beam_search_batch(encoded, input_length, lm, - parallel_iterations=parallel_iterations, swap_memory=swap_memory) diff --git a/tensorflow_asr/models/keras/ctc.py b/tensorflow_asr/models/keras/ctc.py deleted file mode 100644 index a21eff502b..0000000000 --- a/tensorflow_asr/models/keras/ctc.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tensorflow as tf -from tensorflow.keras import mixed_precision as mxp - -from ..ctc import CtcModel as BaseCtcModel -from ...utils.utils import get_reduced_length -from ...losses.keras.ctc_losses import CtcLoss - - -class CtcModel(BaseCtcModel): - """ Keras CTC Model Warper """ - @property - def metrics(self): - return [self.loss_metric] - - def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, **kwargs): - loss = CtcLoss(blank=blank, global_batch_size=global_batch_size) - self.use_loss_scale = use_loss_scale - if self.use_loss_scale: - optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic") - self.loss_metric = tf.keras.metrics.Mean(name="ctc_loss", dtype=tf.float32) - super(CtcModel, self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs) - - def train_step(self, batch): - x, y_true = batch - with tf.GradientTape() as tape: - logit = self(x["input"], training=True) - y_pred = { - "logit": logit, - "logit_length": get_reduced_length(x["input_length"], self.time_reduction_factor) - } - loss = self.loss(y_true, y_pred) - if self.use_loss_scale: - scaled_loss = self.optimizer.get_scaled_loss(loss) - if self.use_loss_scale: - scaled_gradients = tape.gradient(scaled_loss, self.trainable_weights) - gradients = self.optimizer.get_unscaled_gradients(scaled_gradients) - else: - gradients = tape.gradient(loss, self.trainable_weights) - self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) - self.loss_metric.update_state(loss) - return {m.name: m.result() for m in self.metrics} - - def test_step(self, batch): - x, y_true = batch - logit = self(x["input"], training=False) - y_pred = { - "logit": logit, - "logit_length": get_reduced_length(x["input_length"], self.time_reduction_factor) - } - loss = self.loss(y_true, y_pred) - self.loss_metric.update_state(loss) - return {m.name: m.result() for m in self.metrics} diff --git 
a/tensorflow_asr/models/keras/deepspeech2.py b/tensorflow_asr/models/keras/deepspeech2.py deleted file mode 100644 index 0c685e87c5..0000000000 --- a/tensorflow_asr/models/keras/deepspeech2.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .ctc import CtcModel -from ..deepspeech2 import ConvModule, RnnModule, FcModule - - -class DeepSpeech2(CtcModel): - def __init__(self, - vocabulary_size: int, - conv_type: str = "conv2d", - conv_kernels: list = [[11, 41], [11, 21], [11, 21]], - conv_strides: list = [[2, 2], [1, 2], [1, 2]], - conv_filters: list = [32, 32, 96], - conv_dropout: float = 0.1, - rnn_nlayers: int = 5, - rnn_type: str = "lstm", - rnn_units: int = 1024, - rnn_bidirectional: bool = True, - rnn_rowconv: int = 0, - rnn_dropout: float = 0.1, - fc_nlayers: int = 0, - fc_units: int = 1024, - fc_dropout: float = 0.1, - name: str = "deepspeech2", - **kwargs): - super(DeepSpeech2, self).__init__(name=name, **kwargs) - - self.conv_module = ConvModule( - conv_type=conv_type, - kernels=conv_kernels, - strides=conv_strides, - filters=conv_filters, - dropout=conv_dropout, - name=f"{self.name}_conv_module" - ) - - self.rnn_module = RnnModule( - nlayers=rnn_nlayers, - rnn_type=rnn_type, - units=rnn_units, - bidirectional=rnn_bidirectional, - rowconv=rnn_rowconv, - dropout=rnn_dropout, - name=f"{self.name}_rnn_module" - ) - - self.fc_module = FcModule( - nlayers=fc_nlayers, - units=fc_units, 
- dropout=fc_dropout, - vocabulary_size=vocabulary_size, - name=f"{self.name}_fc_module" - ) - - self.time_reduction_factor = self.conv_module.reduction_factor - - def call(self, inputs, training=False, **kwargs): - outputs = self.conv_module(inputs, training=training, **kwargs) - outputs = self.rnn_module(outputs, training=training, **kwargs) - outputs = self.fc_module(outputs, training=training, **kwargs) - return outputs - - def summary(self, line_length=100, **kwargs): - self.conv_module.summary(line_length=line_length, **kwargs) - self.rnn_module.summary(line_length=line_length, **kwargs) - self.fc_module.summary(line_length=line_length, **kwargs) - super(DeepSpeech2, self).summary(line_length=line_length, **kwargs) - - def get_config(self): - conf = super(DeepSpeech2, self).get_config() - conf.update(self.conv_module.get_config()) - conf.update(self.rnn_module.get_config()) - conf.update(self.fc_module.get_config()) - return conf diff --git a/tensorflow_asr/models/keras/jasper.py b/tensorflow_asr/models/keras/jasper.py deleted file mode 100644 index b19010d1b0..0000000000 --- a/tensorflow_asr/models/keras/jasper.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tensorflow as tf - -from .ctc import CtcModel -from ..jasper import Reshape, JasperBlock, JasperSubBlock - - -class Jasper(CtcModel): - def __init__(self, - vocabulary_size: int, - dense: bool = False, - first_additional_block_channels: int = 256, - first_additional_block_kernels: int = 11, - first_additional_block_strides: int = 2, - first_additional_block_dilation: int = 1, - first_additional_block_dropout: int = 0.2, - nsubblocks: int = 5, - block_channels: list = [256, 384, 512, 640, 768], - block_kernels: list = [11, 13, 17, 21, 25], - block_dropout: list = [0.2, 0.2, 0.2, 0.3, 0.3], - second_additional_block_channels: int = 896, - second_additional_block_kernels: int = 1, - second_additional_block_strides: int = 1, - second_additional_block_dilation: int = 2, - second_additional_block_dropout: int = 0.4, - third_additional_block_channels: int = 1024, - third_additional_block_kernels: int = 1, - third_additional_block_strides: int = 1, - third_additional_block_dilation: int = 1, - third_additional_block_dropout: int = 0.4, - kernel_regularizer=None, - bias_regularizer=None, - name: str = "jasper", - **kwargs): - super(Jasper, self).__init__(name=name, **kwargs) - - assert len(block_channels) == len(block_kernels) == len(block_dropout) - - self.reshape = Reshape(name=f"{self.name}_reshape") - - self.first_additional_block = JasperSubBlock( - channels=first_additional_block_channels, - kernels=first_additional_block_kernels, - strides=first_additional_block_strides, - dropout=first_additional_block_dropout, - dilation=first_additional_block_dilation, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - name=f"{self.name}_first_block" - ) - - self.blocks = [ - JasperBlock( - nsubblocks=nsubblocks, - channels=block_channels[i], - kernels=block_kernels[i], - dropout=block_dropout[i], - dense=dense, - nresiduals=(i + 1) if dense else 1, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - 
name=f"{self.name}_block_{i}" - ) for i in range(len(block_channels)) - ] - - self.second_additional_block = JasperSubBlock( - channels=second_additional_block_channels, - kernels=second_additional_block_kernels, - strides=second_additional_block_strides, - dropout=second_additional_block_dropout, - dilation=second_additional_block_dilation, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - name=f"{self.name}_second_block" - ) - - self.third_additional_block = JasperSubBlock( - channels=third_additional_block_channels, - kernels=third_additional_block_kernels, - strides=third_additional_block_strides, - dropout=third_additional_block_dropout, - dilation=third_additional_block_dilation, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - name=f"{self.name}_third_block" - ) - - self.last_block = tf.keras.layers.Conv1D( - filters=vocabulary_size, kernel_size=1, - strides=1, padding="same", - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - name=f"{self.name}_last_block" - ) - - self.time_reduction_factor = self.first_additional_block.reduction_factor - self.time_reduction_factor *= self.second_additional_block.reduction_factor - self.time_reduction_factor *= self.third_additional_block.reduction_factor - - def call(self, inputs, training=False, **kwargs): - outputs = self.reshape(inputs) - outputs = self.first_additional_block(outputs, training=training, **kwargs) - - residuals = [] - for block in self.blocks: - outputs, residuals = block([outputs, residuals], training=training, **kwargs) - - outputs = self.second_additional_block(outputs, training=training, **kwargs) - outputs = self.third_additional_block(outputs, training=training, **kwargs) - outputs = self.last_block(outputs, training=training, **kwargs) - return outputs - - def summary(self, line_length=100, **kwargs): - super(Jasper, self).summary(line_length=line_length, **kwargs) - - def get_config(self): - conf = 
self.reshape.get_config() - conf.update(self.first_additional_block.get_config()) - for block in self.blocks: - conf.update(block.get_config()) - conf.update(self.second_additional_block.get_config()) - conf.update(self.third_additional_block.get_config()) - conf.update(self.last_block.get_config()) - return conf diff --git a/tensorflow_asr/models/keras/streaming_transducer.py b/tensorflow_asr/models/keras/streaming_transducer.py deleted file mode 100644 index 8ebb81e279..0000000000 --- a/tensorflow_asr/models/keras/streaming_transducer.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tensorflow as tf - - -from .transducer import Transducer -from ..streaming_transducer import StreamingTransducerEncoder -from ...utils.utils import shape_list - - -class StreamingTransducer(Transducer): - def __init__(self, - vocabulary_size: int, - encoder_reductions: dict = {0: 3, 1: 2}, - encoder_dmodel: int = 640, - encoder_nlayers: int = 8, - encoder_rnn_type: str = "lstm", - encoder_rnn_units: int = 2048, - encoder_layer_norm: bool = True, - encoder_trainable: bool = True, - prediction_embed_dim: int = 320, - prediction_embed_dropout: float = 0, - prediction_num_rnns: int = 2, - prediction_rnn_units: int = 2048, - prediction_rnn_type: str = "lstm", - prediction_layer_norm: bool = True, - prediction_projection_units: int = 640, - prediction_trainable: bool = True, - joint_dim: int = 640, - joint_activation: str = "tanh", - prejoint_linear: bool = True, - postjoint_linear: bool = False, - joint_mode: str = "add", - joint_trainable: bool = True, - kernel_regularizer = None, - bias_regularizer = None, - name = "StreamingTransducer", - **kwargs): - super(StreamingTransducer, self).__init__( - encoder=StreamingTransducerEncoder( - reductions=encoder_reductions, - dmodel=encoder_dmodel, - nlayers=encoder_nlayers, - rnn_type=encoder_rnn_type, - rnn_units=encoder_rnn_units, - layer_norm=encoder_layer_norm, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - trainable=encoder_trainable, - name=f"{name}_encoder" - ), - vocabulary_size=vocabulary_size, - embed_dim=prediction_embed_dim, - embed_dropout=prediction_embed_dropout, - num_rnns=prediction_num_rnns, - rnn_units=prediction_rnn_units, - rnn_type=prediction_rnn_type, - layer_norm=prediction_layer_norm, - projection_units=prediction_projection_units, - prediction_trainable=prediction_trainable, - joint_dim=joint_dim, - joint_activation=joint_activation, - prejoint_linear=prejoint_linear, - postjoint_linear=postjoint_linear, - joint_mode=joint_mode, - 
joint_trainable=joint_trainable, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - name=name, **kwargs - ) - self.time_reduction_factor = self.encoder.time_reduction_factor - - def encoder_inference(self, features: tf.Tensor, states: tf.Tensor): - """Infer function for encoder (or encoders) - - Args: - features (tf.Tensor): features with shape [T, F, C] - states (tf.Tensor): previous states of encoders with shape [num_rnns, 1 or 2, 1, P] - - Returns: - tf.Tensor: output of encoders with shape [T, E] - tf.Tensor: states of encoders with shape [num_rnns, 1 or 2, 1, P] - """ - with tf.name_scope(f"{self.name}_encoder"): - outputs = tf.expand_dims(features, axis=0) - outputs, new_states = self.encoder.recognize(outputs, states) - return tf.squeeze(outputs, axis=0), new_states - - # -------------------------------- GREEDY ------------------------------------- - - @tf.function - def recognize(self, - features: tf.Tensor, - input_length: tf.Tensor, - parallel_iterations: int = 10, - swap_memory: bool = True): - """ - RNN Transducer Greedy decoding - Args: - features (tf.Tensor): a batch of padded extracted features - - Returns: - tf.Tensor: a batch of decoded transcripts - """ - batch_size, _, _, _ = shape_list(features) - encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size)) - return self._perform_greedy_batch(encoded, input_length, - parallel_iterations=parallel_iterations, swap_memory=swap_memory) - - def recognize_tflite(self, signal, predicted, encoder_states, prediction_states): - """ - Function to convert to tflite using greedy decoding (default streaming mode) - Args: - signal: tf.Tensor with shape [None] indicating a single audio signal - predicted: last predicted character with shape [] - encoder_states: lastest encoder states with shape [num_rnns, 1 or 2, 1, P] - prediction_states: lastest prediction states with shape [num_rnns, 1 or 2, 1, P] - - Return: - transcript: tf.Tensor of Unicode Code 
Points with shape [None] and dtype tf.int32 - predicted: last predicted character with shape [] - encoder_states: lastest encoder states with shape [num_rnns, 1 or 2, 1, P] - prediction_states: lastest prediction states with shape [num_rnns, 1 or 2, 1, P] - """ - features = self.speech_featurizer.tf_extract(signal) - encoded, new_encoder_states = self.encoder_inference(features, encoder_states) - hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, prediction_states) - transcript = self.text_featurizer.indices2upoints(hypothesis.prediction) - return transcript, hypothesis.index, new_encoder_states, hypothesis.states - - def recognize_tflite_with_timestamp(self, signal, predicted, encoder_states, prediction_states): - features = self.speech_featurizer.tf_extract(signal) - encoded, new_encoder_states = self.encoder_inference(features, encoder_states) - hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, prediction_states) - indices = self.text_featurizer.normalize_indices(hypothesis.prediction) - upoints = tf.gather_nd(self.text_featurizer.upoints, tf.expand_dims(indices, axis=-1)) # [None, max_subword_length] - - num_samples = tf.cast(tf.shape(signal)[0], dtype=tf.float32) - total_time_reduction_factor = self.time_reduction_factor * self.speech_featurizer.frame_step - - stime = tf.range(0, num_samples, delta=total_time_reduction_factor, dtype=tf.float32) - stime /= tf.cast(self.speech_featurizer.sample_rate, dtype=tf.float32) - - etime = tf.range(total_time_reduction_factor, num_samples, delta=total_time_reduction_factor, dtype=tf.float32) - etime /= tf.cast(self.speech_featurizer.sample_rate, dtype=tf.float32) - - non_blank = tf.where(tf.not_equal(upoints, 0)) - non_blank_transcript = tf.gather_nd(upoints, non_blank) - non_blank_stime = tf.gather_nd(tf.repeat(tf.expand_dims(stime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank) - non_blank_etime = tf.gather_nd(tf.repeat(tf.expand_dims(etime, axis=-1), 
tf.shape(upoints)[-1], axis=-1), non_blank) - - return non_blank_transcript, non_blank_stime, non_blank_etime, hypothesis.index, new_encoder_states, hypothesis.states - - # -------------------------------- BEAM SEARCH ------------------------------------- - - @tf.function - def recognize_beam(self, - features: tf.Tensor, - input_length: tf.Tensor, - lm: bool = False, - parallel_iterations: int = 10, - swap_memory: bool = True): - """ - RNN Transducer Beam Search - Args: - features (tf.Tensor): a batch of padded extracted features - lm (bool, optional): whether to use language model. Defaults to False. - - Returns: - tf.Tensor: a batch of decoded transcripts - """ - batch_size, _, _, _ = shape_list(features) - encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size)) - return self._perform_beam_search_batch(encoded, input_length, lm, - parallel_iterations=parallel_iterations, swap_memory=swap_memory) - - # -------------------------------- TFLITE ------------------------------------- - - def make_tflite_function(self, timestamp: bool = True): - tflite_func = self.recognize_tflite_with_timestamp if timestamp else self.recognize_tflite - return tf.function( - tflite_func, - input_signature=[ - tf.TensorSpec([None], dtype=tf.float32), - tf.TensorSpec([], dtype=tf.int32), - tf.TensorSpec(self.encoder.get_initial_state().get_shape(), dtype=tf.float32), - tf.TensorSpec(self.predict_net.get_initial_state().get_shape(), dtype=tf.float32) - ] - ) diff --git a/tensorflow_asr/models/keras/transducer.py b/tensorflow_asr/models/keras/transducer.py deleted file mode 100644 index 269ad65b90..0000000000 --- a/tensorflow_asr/models/keras/transducer.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" https://arxiv.org/pdf/1811.06621.pdf """ - -import tensorflow as tf -from tensorflow.keras import mixed_precision as mxp - -from ..transducer import Transducer as BaseTransducer -from ...utils.utils import get_reduced_length -from ...losses.keras.rnnt_losses import RnntLoss - - -class Transducer(BaseTransducer): - """ Keras Transducer Model Warper """ - @property - def metrics(self): - return [self.loss_metric] - - def _build(self, input_shape, prediction_shape=[None], batch_size=None): - inputs = tf.keras.Input(shape=input_shape, batch_size=batch_size, dtype=tf.float32) - input_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32) - pred = tf.keras.Input(shape=prediction_shape, batch_size=batch_size, dtype=tf.int32) - pred_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32) - self({ - "input": inputs, - "input_length": input_length, - "prediction": pred, - "prediction_length": pred_length - }, training=False) - - def call(self, inputs, training=False, **kwargs): - features = inputs["input"] - prediction = inputs["prediction"] - prediction_length = inputs["prediction_length"] - enc = self.encoder(features, training=training, **kwargs) - pred = self.predict_net([prediction, prediction_length], training=training, **kwargs) - outputs = self.joint_net([enc, pred], training=training, **kwargs) - return { - "logit": outputs, - "logit_length": get_reduced_length(inputs["input_length"], self.time_reduction_factor) - } - - def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, 
**kwargs): - loss = RnntLoss(blank=blank, global_batch_size=global_batch_size) - self.use_loss_scale = use_loss_scale - if self.use_loss_scale: - optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic") - self.loss_metric = tf.keras.metrics.Mean(name="rnnt_loss", dtype=tf.float32) - super(Transducer, self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs) - - def train_step(self, batch): - x, y_true = batch - with tf.GradientTape() as tape: - y_pred = self({ - "input": x["input"], - "input_length": x["input_length"], - "prediction": x["prediction"], - "prediction_length": x["prediction_length"], - }, training=True) - loss = self.loss(y_true, y_pred) - if self.use_loss_scale: - scaled_loss = self.optimizer.get_scaled_loss(loss) - if self.use_loss_scale: - scaled_gradients = tape.gradient(scaled_loss, self.trainable_weights) - gradients = self.optimizer.get_unscaled_gradients(scaled_gradients) - else: - gradients = tape.gradient(loss, self.trainable_weights) - self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) - self.loss_metric.update_state(loss) - return {m.name: m.result() for m in self.metrics} - - def test_step(self, batch): - x, y_true = batch - y_pred = self({ - "input": x["input"], - "input_length": x["input_length"], - "prediction": x["prediction"], - "prediction_length": x["prediction_length"], - }, training=False) - loss = self.loss(y_true, y_pred) - self.loss_metric.update_state(loss) - return {m.name: m.result() for m in self.metrics} diff --git a/tensorflow_asr/runners/README.md b/tensorflow_asr/runners/README.md deleted file mode 100644 index d7eebe3b27..0000000000 --- a/tensorflow_asr/runners/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Runners :wink: - -## Trainers - -The trainers use `BaseTrainer` for training models. To create a custom trainer, define these following methods: - -1. Set metrics (`train_metrics` and `eval_metrics`) -2. 
`_train_step` to process batch of data when training -3. `_eval_step` to process batch of data when validating -4. `compile` for loading built models, optimizers and any custom variables -5. Optionally define `save_model_weights` for save latest model weights every checkpoint. - -## Testers - -_Testers only run in 1 GPU_ because we have to use `tf.py_function` or `tf.numpy_function` to calculate WER, CER, Semetrics, ... - -The testers for **acoustic** models are combined into single class `BaseTester`. Therefore you don't need to define any custom tester for **acoustic** models. - -The `BaseTester` do the steps as follows: - -1. Load test dataset. -2. Run testing with `greedy` decoding, `beamsearch` decoding and if you provide an `Scorer` in `TextFeaturizer.scorer`, it will decode `beamsearch_with_lm`, otherwise another `beamsearch`. -3. The result of `greedy`, `beamsearch` and `beamsearch_with_lm` are written to the `test.tsv` file in the `outdir` configured in `.yml` config file. -4. Finish testing by reloading whole `test.tsv` and calculate `WER, CER` from it, the results are printed to stdout. diff --git a/tensorflow_asr/runners/__init__.py b/tensorflow_asr/runners/__init__.py deleted file mode 100644 index b750f2499e..0000000000 --- a/tensorflow_asr/runners/__init__.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tensorflow as tf - - -def save_from_checkpoint(func, - outdir: str, - max_to_keep: int = 10, - **kwargs): - """ - Function to save models from latest saved checkpoint - Args: - func: function takes inputs as **kwargs and performs when checkpoint is found - outdir: logging directory - max_to_keep: number of checkpoints to keep - **kwargs: contains built models, optimizers - """ - steps = tf.Variable(0, trainable=False, dtype=tf.int64) # Step must be int64 - epochs = tf.Variable(1, trainable=False) - checkpoint_dir = os.path.join(outdir, "checkpoints") - if not os.path.exists(checkpoint_dir): - raise ValueError(f"checkpoint directory not found: {checkpoint_dir}") - ckpt = tf.train.Checkpoint(steps=steps, epochs=epochs, **kwargs) - ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=max_to_keep) - if ckpt_manager.latest_checkpoint: - ckpt.restore(ckpt_manager.latest_checkpoint) - func(**kwargs) - else: - raise ValueError("no lastest checkpoint found") diff --git a/tensorflow_asr/runners/base_runners.py b/tensorflow_asr/runners/base_runners.py deleted file mode 100644 index 696e0718ca..0000000000 --- a/tensorflow_asr/runners/base_runners.py +++ /dev/null @@ -1,498 +0,0 @@ -# This implementation is inspired from -# https://github.com/dathudeptrai/TensorflowTTS/blob/master/tensorflow_tts/trainers/base_trainer.py -# Copyright 2020 Minh Nguyen (@dathudeptrai) Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import abc -import os -from tqdm import tqdm -from colorama import Fore - -import numpy as np -import tensorflow as tf - -from ..configs.config import RunningConfig -from ..utils.utils import get_num_batches, bytes_to_string, get_reduced_length -from ..utils.metrics import ErrorRate, wer, cer - - -class BaseRunner(metaclass=abc.ABCMeta): - """ Customized runner module for all models """ - - def __init__(self, config: RunningConfig): - self.config = config - # Writers - self.writers = { - "train": tf.summary.create_file_writer(os.path.join(self.config.outdir, "tensorboard", "train")), - "eval": tf.summary.create_file_writer(os.path.join(self.config.outdir, "tensorboard", "eval")) - } - - def add_writer(self, stage: str): - self.writers[stage] = tf.summary.create_file_writer(os.path.join(self.config.outdir, "tensorboard", stage)) - - def _write_to_tensorboard(self, - list_metrics: dict, - step: any, - stage: str = "train"): - """Write variables to tensorboard.""" - writer = self.writers.get(stage, None) - - if writer is None: - raise ValueError(f"Missing writer for stage {stage}") - - with writer.as_default(): - for key, value in list_metrics.items(): - if isinstance(value, tf.keras.metrics.Metric): - tf.summary.scalar(key, value.result(), step=step) - else: - tf.summary.scalar(key, value, step=step) - writer.flush() - - -class BaseTrainer(BaseRunner): - """Customized trainer module for all models.""" - - def __init__(self, - config: RunningConfig, - strategy: tf.distribute.Strategy = None): - # Configurations - super(BaseTrainer, self).__init__(config) - self.set_strategy(strategy) - # Steps and Epochs start from 0 - # Step must be int64 to use tf.summary - self.steps = tf.Variable(0, trainable=False, dtype=tf.int64) - self.train_steps_per_epoch = None - self.eval_steps_per_epoch = None - # Dataset - self.train_data_loader = None - self.eval_data_loader = None - 
- with self.strategy.scope(): - self.set_train_metrics() - self.set_eval_metrics() - - @property - def total_train_steps(self): - if self.train_steps_per_epoch is None: return None - return self.config.num_epochs * self.train_steps_per_epoch - - @property - def epochs(self): - if self.train_steps_per_epoch is None: return 1 - return (self.steps.numpy() // self.train_steps_per_epoch) + 1 - - # -------------------------------- GET SET ------------------------------------- - - @abc.abstractmethod - def set_train_metrics(self): - self.train_metrics = {} - raise NotImplementedError() - - @abc.abstractmethod - def set_eval_metrics(self): - self.eval_metrics = {} - raise NotImplementedError() - - def set_strategy(self, strategy=None): - if strategy is None: - gpus = tf.config.experimental.list_physical_devices('GPU') - self.strategy = tf.distribute.OneDeviceStrategy("/GPU:0") if gpus else \ - tf.distribute.OneDeviceStrategy("/CPU:0") - else: - self.strategy = strategy - - def set_train_data_loader(self, train_dataset, train_bs=None, train_acs=None): - """ Set train data loader (MUST). """ - if not train_bs: train_bs = self.config.batch_size - self.global_batch_size = train_bs * self.strategy.num_replicas_in_sync - self.config.batch_size = train_bs # Update batch size fed from arguments - - if not train_acs: train_acs = self.config.accumulation_steps - self.config.accumulation_steps = train_acs # update accum steps fed from arguments - - self.train_data = train_dataset.create(self.global_batch_size) - self.train_data_loader = self.strategy.experimental_distribute_dataset(self.train_data) - if hasattr(self, "accumulation") and train_dataset.total_steps is not None: - self.train_steps_per_epoch = train_dataset.total_steps // self.config.accumulation_steps - else: - self.train_steps_per_epoch = train_dataset.total_steps - - def set_eval_data_loader(self, eval_dataset, eval_bs=None): - """ Set eval data loader (MUST). 
- Eval batch might be significantly greater than train batch """ - if eval_dataset is None: - self.eval_data = None - self.eval_data_loader = None - return - if not eval_bs: eval_bs = self.config.batch_size - self.eval_data = eval_dataset.create(eval_bs * self.strategy.num_replicas_in_sync) - if self.eval_data is None: - self.eval_data_loader = None - else: - self.eval_data_loader = self.strategy.experimental_distribute_dataset(self.eval_data) - self.eval_steps_per_epoch = eval_dataset.total_steps - - # -------------------------------- CHECKPOINTS ------------------------------------- - - def create_checkpoint_manager(self, max_to_keep=10, **kwargs): - """Create checkpoint management.""" - with self.strategy.scope(): - self.ckpt = tf.train.Checkpoint(steps=self.steps, **kwargs) - checkpoint_dir = os.path.join(self.config.outdir, "checkpoints") - if not tf.io.gfile.exists(checkpoint_dir): tf.io.gfile.makedirs(checkpoint_dir) - self.ckpt_manager = tf.train.CheckpointManager(self.ckpt, checkpoint_dir, max_to_keep=max_to_keep) - - def save_checkpoint(self): - """Save checkpoint.""" - with self.strategy.scope(): - self.ckpt_manager.save() - self.train_progbar.set_postfix_str("Successfully Saved Checkpoint") - - def load_checkpoint(self): - """Load checkpoint.""" - with self.strategy.scope(): - if self.ckpt_manager.latest_checkpoint: - self.ckpt.restore(self.ckpt_manager.latest_checkpoint) - - def save_model_weights(self): - """ Save the latest model's weights at each save_interval_steps """ - pass - - # -------------------------------- RUNNING ------------------------------------- - - def _finished(self): - if self.train_steps_per_epoch is None: - return False - return self.steps.numpy() >= self.total_train_steps - - def run(self): - """Run training.""" - if self.steps.numpy() > 0: tf.print("Resume training ...") - - self.train_progbar = tqdm( - initial=self.steps.numpy(), unit="batch", total=self.total_train_steps, - position=0, leave=True, - bar_format="{desc} 
|%s{bar:20}%s{r_bar}" % (Fore.GREEN, Fore.RESET), - desc="[Train]" - ) - - while not self._finished(): - self._train_epoch() - - # save and evaluate when training is done - self.save_checkpoint() - self.save_model_weights() - self.log_train_metrics() - self._eval_epoch() - - self.train_progbar.close() - print("> Finish training") - - def _train_epoch(self): - """Train model one epoch.""" - train_iterator = iter(self.train_data_loader) - train_steps = 0 - while True: - try: - self._train_function(train_iterator) # Run train step - except StopIteration: - break - except tf.errors.OutOfRangeError: - break - except Exception as e: - raise e - - # Update steps - self.steps.assign_add(1) - self.train_progbar.update(1) - train_steps += 1 - - # Run save checkpoint - self._check_save_interval() - - # Print epoch info - self.train_progbar.set_description_str(f"[Train] [Epoch {self.epochs}/{self.config.num_epochs}]") - - # Print train info to progress bar - self._print_train_metrics(self.train_progbar) - - # Run logging - self._check_log_interval() - - # Run evaluation - self._check_eval_interval() - - self.train_steps_per_epoch = train_steps - self.train_progbar.total = self.total_train_steps - self.train_progbar.refresh() - - @tf.function - def _train_function(self, iterator): - batch = next(iterator) - self.strategy.run(self._train_step, args=(batch,)) - - @abc.abstractmethod - def _train_step(self, batch): - """ One step training. 
Does not return anything""" - raise NotImplementedError() - - def _eval_epoch(self): - """One epoch evaluation.""" - if not self.eval_data_loader: return - - print("\n> Start evaluation ...") - - for metric in self.eval_metrics.keys(): - self.eval_metrics[metric].reset_states() - - eval_progbar = tqdm( - initial=0, total=self.eval_steps_per_epoch, unit="batch", - position=0, leave=True, - bar_format="{desc} |%s{bar:20}%s{r_bar}" % (Fore.BLUE, Fore.RESET), - desc=f"[Eval] [Step {self.steps.numpy()}]" - ) - eval_iterator = iter(self.eval_data_loader) - eval_steps = 0 - - while True: - try: - self._eval_function(eval_iterator) # Run eval step - except StopIteration: - break - except tf.errors.OutOfRangeError: - break - except Exception as e: - raise e - - # Update steps - eval_progbar.update(1) - eval_steps += 1 - - # Print eval info to progress bar - self._print_eval_metrics(eval_progbar) - - self.eval_steps_per_epoch = eval_steps - eval_progbar.close() - # Write to tensorboard - self._write_to_tensorboard(self.eval_metrics, self.steps, stage="eval") - - print("> End evaluation ...") - - @tf.function - def _eval_function(self, iterator): - batch = next(iterator) - self.strategy.run(self._eval_step, args=(batch,)) - - @abc.abstractmethod - def _eval_step(self, batch): - """One eval step. 
Does not return anything""" - raise NotImplementedError() - - @abc.abstractmethod - def compile(self, *args, **kwargs): - """ Function to initialize models and optimizers """ - raise NotImplementedError() - - def fit(self, train_dataset, eval_dataset=None, train_bs=None, train_acs=None, eval_bs=None): - """ Function run start training, including executing "run" func """ - self.set_train_data_loader(train_dataset, train_bs, train_acs) - self.set_eval_data_loader(eval_dataset, eval_bs) - self.load_checkpoint() - self.run() - - # -------------------------------- LOGGING ------------------------------------- - - def log_train_metrics(self): - self._write_to_tensorboard(self.train_metrics, self.steps, stage="train") - """Reset train metrics after save it to tensorboard.""" - for metric in self.train_metrics.keys(): - self.train_metrics[metric].reset_states() - - def _check_log_interval(self): - """Save log interval.""" - if (self.steps.numpy() % self.config.log_interval_steps == 0): - self.log_train_metrics() - - def _check_save_interval(self): - """Save log interval.""" - if (self.steps.numpy() % self.config.save_interval_steps == 0): - self.save_checkpoint() - self.save_model_weights() - - def _check_eval_interval(self): - """Save log interval.""" - if (self.steps.numpy() % self.config.eval_interval_steps == 0): - self._eval_epoch() - - # -------------------------------- UTILS ------------------------------------- - - def _print_train_metrics(self, progbar): - result_dict = {key: str(value.result().numpy()) for key, value in self.train_metrics.items()} - progbar.set_postfix(result_dict) - - def _print_eval_metrics(self, progbar): - result_dict = {key: str(value.result().numpy()) for key, value in self.eval_metrics.items()} - progbar.set_postfix(result_dict) - - # -------------------------------- END ------------------------------------- - - -class BaseTester(BaseRunner): - """ Customized tester module for all models - This tester model will write results to test.tsv 
file in outdir - After writing finished, it will calculate testing metrics - """ - - def __init__(self, - config: RunningConfig, - output_name: str = "test"): - super(BaseTester, self).__init__(config) - self.test_data_loader = None - self.processed_records = 0 - - self.output_file_path = os.path.join(self.config.outdir, f"{output_name}.tsv") - self.test_metrics = { - "beam_wer": ErrorRate(func=wer, name="test_beam_wer", dtype=tf.float32), - "beam_cer": ErrorRate(func=cer, name="test_beam_cer", dtype=tf.float32), - "beam_lm_wer": ErrorRate(func=wer, name="test_beam_lm_wer", dtype=tf.float32), - "beam_lm_cer": ErrorRate(func=cer, name="test_beam_lm_cer", dtype=tf.float32), - "greed_wer": ErrorRate(func=wer, name="test_greed_wer", dtype=tf.float32), - "greed_cer": ErrorRate(func=cer, name="test_greed_cer", dtype=tf.float32) - } - - def set_output_file(self, batch_size: int = 1): - if not batch_size: batch_size = self.config.batch_size - if os.path.exists(self.output_file_path): - with open(self.output_file_path, "r", encoding="utf-8") as out: - self.processed_records = get_num_batches(len(out.read().splitlines()) - 1, batch_size=batch_size, - drop_remainders=False) - else: - with open(self.output_file_path, "w") as out: - out.write("PATH\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\tBEAMSEARCHLM\n") - - def set_test_data_loader(self, test_dataset, batch_size=None): - """Set train data loader (MUST).""" - if not batch_size: batch_size = self.config.batch_size - self.test_data_loader = test_dataset.create(batch_size) - self.total_steps = test_dataset.total_steps - - # -------------------------------- RUNNING ------------------------------------- - - def compile(self, trained_model: tf.keras.Model): - """ Set loaded trained model """ - if not hasattr(trained_model, "speech_featurizer"): - raise AttributeError("Please do 'add_featurizers' before testing") - self.model = trained_model - - def run(self, test_dataset, batch_size=None): - self.set_output_file(batch_size=batch_size) - 
self.set_test_data_loader(test_dataset, batch_size=batch_size) - self._test_epoch() - self._finish() - - def _test_epoch(self): - if self.processed_records > 0: - self.test_data_loader = self.test_data_loader.skip(self.processed_records) - progbar = tqdm(initial=self.processed_records, total=self.total_steps, - unit="batch", position=0, desc="[Test]") - test_iter = iter(self.test_data_loader) - while True: - try: - decoded = self._test_function(test_iter) - except StopIteration: - break - except tf.errors.OutOfRangeError: - break - - decoded = [None if d is None else d.numpy() for d in decoded] - self._append_to_file(*decoded) - progbar.update(1) - - progbar.close() - - @tf.function - def _test_function(self, iterator): - batch = next(iterator) - return self._test_step(batch) - - @tf.function(experimental_relax_shapes=True) - def _test_step(self, batch): - """ - One testing step - Args: - batch: a step fed from test dataset - - Returns: - (file_paths, groundtruth, greedy, beamsearch, beamsearch_lm) each has shape [B] - """ - file_paths, features, input_length, labels, _, _, _ = batch - - labels = self.model.text_featurizer.iextract(labels) - input_length = get_reduced_length(input_length, self.model.time_reduction_factor) - greed_pred = self.model.recognize(features, input_length) - beam_pred = beam_lm_pred = None - if self.model.text_featurizer.decoder_config.beam_width > 0: - beam_pred = self.model.recognize_beam(features, input_length, lm=False) - if self.model.text_featurizer.decoder_config.lm_config: - beam_lm_pred = self.model.recognize_beam(features, input_length, lm=True) - - return file_paths, labels, greed_pred, beam_pred, beam_lm_pred - - # -------------------------------- UTILS ------------------------------------- - - def _finish(self): - tf.print("\n> Calculating evaluation metrics ...") - with open(self.output_file_path, "r", encoding="utf-8") as out: - lines = out.read().splitlines() - lines = lines[1:] # skip header - - for line in lines: - line = 
line.split("\t") - labels, greed_pred, beam_pred, beam_lm_pred = line[1], line[2], line[3], line[4] - labels = tf.convert_to_tensor([labels], dtype=tf.string) - greed_pred = tf.convert_to_tensor([greed_pred], dtype=tf.string) - beam_pred = tf.convert_to_tensor([beam_pred], dtype=tf.string) - beam_lm_pred = tf.convert_to_tensor([beam_lm_pred], dtype=tf.string) - # Update metrics - self.test_metrics["greed_wer"].update_state(greed_pred, labels) - self.test_metrics["greed_cer"].update_state(greed_pred, labels) - self.test_metrics["beam_wer"].update_state(beam_pred, labels) - self.test_metrics["beam_cer"].update_state(beam_pred, labels) - self.test_metrics["beam_lm_wer"].update_state(beam_lm_pred, labels) - self.test_metrics["beam_lm_cer"].update_state(beam_lm_pred, labels) - - tf.print("Test results:") - tf.print("G_WER =", self.test_metrics["greed_wer"].result()) - tf.print("G_CER =", self.test_metrics["greed_cer"].result()) - tf.print("B_WER =", self.test_metrics["beam_wer"].result()) - tf.print("B_CER =", self.test_metrics["beam_cer"].result()) - tf.print("BLM_WER =", self.test_metrics["beam_lm_wer"].result()) - tf.print("BLM_CER =", self.test_metrics["beam_lm_cer"].result()) - - def _append_to_file(self, - file_path: np.ndarray, - groundtruth: np.ndarray, - greedy: np.ndarray, - beamsearch: np.ndarray, - beamsearch_lm: np.ndarray): - file_path = bytes_to_string(file_path) - groundtruth = bytes_to_string(groundtruth) - greedy = bytes_to_string(greedy) - beamsearch = bytes_to_string(beamsearch) if beamsearch is not None else ["" for _ in file_path] - beamsearch_lm = bytes_to_string(beamsearch_lm) if beamsearch_lm is not None else ["" for _ in file_path] - with open(self.output_file_path, "a", encoding="utf-8") as out: - for i, path in enumerate(file_path): - line = f"{groundtruth[i]}\t{greedy[i]}\t{beamsearch[i]}\t{beamsearch_lm[i]}" - out.write(f"{path.strip()}\t{line}\n") - - # -------------------------------- END ------------------------------------- diff --git 
a/tensorflow_asr/runners/ctc_runners.py b/tensorflow_asr/runners/ctc_runners.py deleted file mode 100644 index ddc2d01dbd..0000000000 --- a/tensorflow_asr/runners/ctc_runners.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import tensorflow as tf - -from ..configs.config import RunningConfig -from ..featurizers.text_featurizers import TextFeaturizer -from ..losses.ctc_losses import ctc_loss -from .base_runners import BaseTrainer -from ..optimizers.accumulation import GradientAccumulation -from ..utils.utils import get_reduced_length - - -class CTCTrainer(BaseTrainer): - """ Trainer for CTC Models """ - - def __init__(self, - text_featurizer: TextFeaturizer, - config: RunningConfig, - strategy: tf.distribute.Strategy = None): - self.text_featurizer = text_featurizer - super(CTCTrainer, self).__init__(config=config, strategy=strategy) - - def set_train_metrics(self): - self.train_metrics = { - "ctc_loss": tf.keras.metrics.Mean("train_ctc_loss", dtype=tf.float32) - } - - def set_eval_metrics(self): - self.eval_metrics = { - "ctc_loss": tf.keras.metrics.Mean("eval_ctc_loss", dtype=tf.float32), - } - - def save_model_weights(self): - with self.strategy.scope(): - self.model.save_weights(os.path.join(self.config.outdir, "latest.h5")) - - @tf.function(experimental_relax_shapes=True) - def _train_step(self, batch): - _, features, input_length, labels, label_length, _, _ = batch - - 
with tf.GradientTape() as tape: - y_pred = self.model(features, training=True) - tape.watch(y_pred) - per_train_loss = ctc_loss( - y_true=labels, y_pred=y_pred, - input_length=get_reduced_length(input_length, self.model.time_reduction_factor), - label_length=label_length, - blank=self.text_featurizer.blank - ) - train_loss = tf.nn.compute_average_loss(per_train_loss, - global_batch_size=self.global_batch_size) - - gradients = tape.gradient(train_loss, self.model.trainable_variables) - self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables)) - - self.train_metrics["ctc_loss"].update_state(per_train_loss) - - @tf.function(experimental_relax_shapes=True) - def _eval_step(self, batch): - _, features, input_length, labels, label_length, _, _ = batch - - logits = self.model(features, training=False) - - per_eval_loss = ctc_loss( - y_true=labels, y_pred=logits, - input_length=get_reduced_length(input_length, self.model.time_reduction_factor), - label_length=label_length, - blank=self.text_featurizer.blank - ) - - # Update metrics - self.eval_metrics["ctc_loss"].update_state(per_eval_loss) - - def compile(self, model: tf.keras.Model, - optimizer: any, - max_to_keep: int = 10): - with self.strategy.scope(): - self.model = model - self.optimizer = tf.keras.optimizers.get(optimizer) - self.create_checkpoint_manager(max_to_keep, model=self.model, optimizer=self.optimizer) - - -class CTCTrainerGA(CTCTrainer): - """ Trainer for CTC Models """ - - @tf.function - def _train_function(self, iterator): - for _ in range(self.config.accumulation_steps): - batch = next(iterator) - self.strategy.run(self._train_step, args=(batch,)) - self.strategy.run(self._apply_gradients, args=()) - - @tf.function - def _apply_gradients(self): - self.optimizer.apply_gradients( - zip(self.accumulation.gradients, self.model.trainable_variables)) - self.accumulation.reset() - - @tf.function(experimental_relax_shapes=True) - def _train_step(self, batch): - _, features, input_length, 
labels, label_length, _, _ = batch - - with tf.GradientTape() as tape: - y_pred = self.model(features, training=True) - tape.watch(y_pred) - per_train_loss = ctc_loss( - y_true=labels, y_pred=y_pred, - input_length=get_reduced_length(input_length, self.model.time_reduction_factor), - label_length=label_length, - blank=self.text_featurizer.blank - ) - train_loss = tf.nn.compute_average_loss(per_train_loss, - global_batch_size=self.global_batch_size) - - gradients = tape.gradient(train_loss, self.model.trainable_variables) - self.accumulation.accumulate(gradients) - self.train_metrics["ctc_loss"].update_state(per_train_loss) - - def compile(self, model: tf.keras.Model, - optimizer: any, - max_to_keep: int = 10): - with self.strategy.scope(): - self.model = model - self.optimizer = tf.keras.optimizers.get(optimizer) - self.create_checkpoint_manager(max_to_keep, model=self.model, optimizer=self.optimizer) - self.accumulation = GradientAccumulation(self.model.trainable_variables) diff --git a/tensorflow_asr/runners/transducer_runners.py b/tensorflow_asr/runners/transducer_runners.py deleted file mode 100644 index d8e396be56..0000000000 --- a/tensorflow_asr/runners/transducer_runners.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import tensorflow as tf - -from ..configs.config import RunningConfig -from ..optimizers.accumulation import GradientAccumulation -from .base_runners import BaseTrainer -from ..losses.rnnt_losses import rnnt_loss -from ..models.transducer import Transducer -from ..featurizers.text_featurizers import TextFeaturizer -from ..utils.utils import get_reduced_length - - -class TransducerTrainer(BaseTrainer): - def __init__(self, - config: RunningConfig, - text_featurizer: TextFeaturizer, - strategy: tf.distribute.Strategy = None): - self.text_featurizer = text_featurizer - super(TransducerTrainer, self).__init__(config, strategy=strategy) - - def set_train_metrics(self): - self.train_metrics = { - "transducer_loss": tf.keras.metrics.Mean("train_transducer_loss", dtype=tf.float32) - } - - def set_eval_metrics(self): - self.eval_metrics = { - "transducer_loss": tf.keras.metrics.Mean("eval_transducer_loss", dtype=tf.float32) - } - - def save_model_weights(self): - self.model.save_weights(os.path.join(self.config.outdir, "latest.h5")) - - @tf.function(experimental_relax_shapes=True) - def _train_step(self, batch): - _, features, input_length, labels, label_length, prediction, prediction_length = batch - - with tf.GradientTape() as tape: - logits = self.model([features, input_length, prediction, prediction_length], training=True) - tape.watch(logits) - per_train_loss = rnnt_loss( - logits=logits, labels=labels, label_length=label_length, - logit_length=get_reduced_length(input_length, self.model.time_reduction_factor), - blank=self.text_featurizer.blank - ) - train_loss = tf.nn.compute_average_loss(per_train_loss, - global_batch_size=self.global_batch_size) - - gradients = tape.gradient(train_loss, self.model.trainable_variables) - self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables)) - - self.train_metrics["transducer_loss"].update_state(per_train_loss) - - @tf.function(experimental_relax_shapes=True) - def _eval_step(self, batch): - _, 
features, input_length, labels, label_length, prediction, prediction_length = batch - - logits = self.model([features, input_length, prediction, prediction_length], training=False) - eval_loss = rnnt_loss( - logits=logits, labels=labels, label_length=label_length, - logit_length=get_reduced_length(input_length, self.model.time_reduction_factor), - blank=self.text_featurizer.blank - ) - - self.eval_metrics["transducer_loss"].update_state(eval_loss) - - def compile(self, - model: Transducer, - optimizer: any, - max_to_keep: int = 10): - with self.strategy.scope(): - self.model = model - self.optimizer = tf.keras.optimizers.get(optimizer) - self.create_checkpoint_manager(max_to_keep, model=self.model, optimizer=self.optimizer) - - -class TransducerTrainerGA(TransducerTrainer): - """ Transducer Trainer that uses Gradients Accumulation """ - - @tf.function - def _train_function(self, iterator): - for _ in range(self.config.accumulation_steps): - batch = next(iterator) - self.strategy.run(self._train_step, args=(batch,)) - self.strategy.run(self._apply_gradients, args=()) - - @tf.function - def _apply_gradients(self): - self.optimizer.apply_gradients( - zip(self.accumulation.gradients, self.model.trainable_variables)) - self.accumulation.reset() - - @tf.function(experimental_relax_shapes=True) - def _train_step(self, batch): - _, features, input_length, labels, label_length, prediction, prediction_length = batch - - with tf.GradientTape() as tape: - logits = self.model([features, input_length, prediction, prediction_length], training=True) - tape.watch(logits) - per_train_loss = rnnt_loss( - logits=logits, labels=labels, label_length=label_length, - logit_length=get_reduced_length(input_length, self.model.time_reduction_factor), - blank=self.text_featurizer.blank - ) - train_loss = tf.nn.compute_average_loss( - per_train_loss, - global_batch_size=self.global_batch_size - ) - - gradients = tape.gradient(train_loss, self.model.trainable_variables) - 
self.accumulation.accumulate(gradients) - self.train_metrics["transducer_loss"].update_state(per_train_loss) - - def compile(self, - model: Transducer, - optimizer: any, - max_to_keep: int = 10): - with self.strategy.scope(): - self.model = model - self.optimizer = tf.keras.optimizers.get(optimizer) - self.create_checkpoint_manager(max_to_keep, model=self.model, optimizer=self.optimizer) - self.accumulation = GradientAccumulation(self.model.trainable_variables) From 9e39e5b5e601c208442f2103570c8f65b2c062ea Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Wed, 14 Apr 2021 00:13:56 +0700 Subject: [PATCH 04/13] :writing_hand: update models encoders --- .../README.md | 0 .../config.yml | 0 .../test_rnn_transducer.py} | 0 .../test_subword_rnn_transducer.py} | 0 .../tflite_rnn_transducer.py} | 0 .../tflite_subword_rnn_transducer.py} | 0 .../train_ga_rnn_transducer.py} | 0 .../train_ga_subword_rnn_transducer.py} | 0 .../train_keras_subword_rnn_transducer.py} | 0 .../train_rnn_transducer.py} | 0 .../train_subword_rnn_transducer.py} | 0 tensorflow_asr/models/encoders/__init__.py | 0 tensorflow_asr/models/encoders/conformer.py | 363 ++++++++++++++++++ tensorflow_asr/models/encoders/contextnet.py | 191 +++++++++ tensorflow_asr/models/transducer/conformer.py | 349 +---------------- .../models/transducer/contextnet.py | 177 +-------- 16 files changed, 557 insertions(+), 523 deletions(-) rename examples/{streaming_transducer => rnn_transducer}/README.md (100%) mode change 100755 => 100644 rename examples/{streaming_transducer => rnn_transducer}/config.yml (100%) mode change 100755 => 100644 rename examples/{streaming_transducer/test_streaming_transducer.py => rnn_transducer/test_rnn_transducer.py} (100%) mode change 100755 => 100644 rename examples/{streaming_transducer/test_subword_streaming_transducer.py => rnn_transducer/test_subword_rnn_transducer.py} (100%) mode change 100755 => 100644 rename examples/{streaming_transducer/tflite_streaming_transducer.py => 
rnn_transducer/tflite_rnn_transducer.py} (100%) rename examples/{streaming_transducer/tflite_subword_streaming_transducer.py => rnn_transducer/tflite_subword_rnn_transducer.py} (100%) rename examples/{streaming_transducer/train_ga_streaming_transducer.py => rnn_transducer/train_ga_rnn_transducer.py} (100%) rename examples/{streaming_transducer/train_ga_subword_streaming_transducer.py => rnn_transducer/train_ga_subword_rnn_transducer.py} (100%) rename examples/{streaming_transducer/train_keras_subword_streaming_transducer.py => rnn_transducer/train_keras_subword_rnn_transducer.py} (100%) rename examples/{streaming_transducer/train_streaming_transducer.py => rnn_transducer/train_rnn_transducer.py} (100%) rename examples/{streaming_transducer/train_subword_streaming_transducer.py => rnn_transducer/train_subword_rnn_transducer.py} (100%) create mode 100644 tensorflow_asr/models/encoders/__init__.py create mode 100644 tensorflow_asr/models/encoders/conformer.py create mode 100644 tensorflow_asr/models/encoders/contextnet.py diff --git a/examples/streaming_transducer/README.md b/examples/rnn_transducer/README.md old mode 100755 new mode 100644 similarity index 100% rename from examples/streaming_transducer/README.md rename to examples/rnn_transducer/README.md diff --git a/examples/streaming_transducer/config.yml b/examples/rnn_transducer/config.yml old mode 100755 new mode 100644 similarity index 100% rename from examples/streaming_transducer/config.yml rename to examples/rnn_transducer/config.yml diff --git a/examples/streaming_transducer/test_streaming_transducer.py b/examples/rnn_transducer/test_rnn_transducer.py old mode 100755 new mode 100644 similarity index 100% rename from examples/streaming_transducer/test_streaming_transducer.py rename to examples/rnn_transducer/test_rnn_transducer.py diff --git a/examples/streaming_transducer/test_subword_streaming_transducer.py b/examples/rnn_transducer/test_subword_rnn_transducer.py old mode 100755 new mode 100644 similarity 
index 100% rename from examples/streaming_transducer/test_subword_streaming_transducer.py rename to examples/rnn_transducer/test_subword_rnn_transducer.py diff --git a/examples/streaming_transducer/tflite_streaming_transducer.py b/examples/rnn_transducer/tflite_rnn_transducer.py similarity index 100% rename from examples/streaming_transducer/tflite_streaming_transducer.py rename to examples/rnn_transducer/tflite_rnn_transducer.py diff --git a/examples/streaming_transducer/tflite_subword_streaming_transducer.py b/examples/rnn_transducer/tflite_subword_rnn_transducer.py similarity index 100% rename from examples/streaming_transducer/tflite_subword_streaming_transducer.py rename to examples/rnn_transducer/tflite_subword_rnn_transducer.py diff --git a/examples/streaming_transducer/train_ga_streaming_transducer.py b/examples/rnn_transducer/train_ga_rnn_transducer.py similarity index 100% rename from examples/streaming_transducer/train_ga_streaming_transducer.py rename to examples/rnn_transducer/train_ga_rnn_transducer.py diff --git a/examples/streaming_transducer/train_ga_subword_streaming_transducer.py b/examples/rnn_transducer/train_ga_subword_rnn_transducer.py similarity index 100% rename from examples/streaming_transducer/train_ga_subword_streaming_transducer.py rename to examples/rnn_transducer/train_ga_subword_rnn_transducer.py diff --git a/examples/streaming_transducer/train_keras_subword_streaming_transducer.py b/examples/rnn_transducer/train_keras_subword_rnn_transducer.py similarity index 100% rename from examples/streaming_transducer/train_keras_subword_streaming_transducer.py rename to examples/rnn_transducer/train_keras_subword_rnn_transducer.py diff --git a/examples/streaming_transducer/train_streaming_transducer.py b/examples/rnn_transducer/train_rnn_transducer.py similarity index 100% rename from examples/streaming_transducer/train_streaming_transducer.py rename to examples/rnn_transducer/train_rnn_transducer.py diff --git 
a/examples/streaming_transducer/train_subword_streaming_transducer.py b/examples/rnn_transducer/train_subword_rnn_transducer.py similarity index 100% rename from examples/streaming_transducer/train_subword_streaming_transducer.py rename to examples/rnn_transducer/train_subword_rnn_transducer.py diff --git a/tensorflow_asr/models/encoders/__init__.py b/tensorflow_asr/models/encoders/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorflow_asr/models/encoders/conformer.py b/tensorflow_asr/models/encoders/conformer.py new file mode 100644 index 0000000000..de7b767fdd --- /dev/null +++ b/tensorflow_asr/models/encoders/conformer.py @@ -0,0 +1,363 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import tensorflow as tf

from ..activations.glu import GLU
from ..layers.subsampling import VggSubsampling, Conv2dSubsampling
from ..layers.positional_encoding import PositionalEncoding, PositionalEncodingConcat
from ..layers.multihead_attention import MultiHeadAttention, RelPositionMultiHeadAttention
from ...utils import shape_util

# Default regularizer used for both kernels and biases by every layer below.
L2 = tf.keras.regularizers.l2(1e-6)


class FFModule(tf.keras.layers.Layer):
    """Feed-forward module with a scaled residual connection.

    Pipeline: LayerNorm -> Dense(4 * input_dim) -> swish -> dropout ->
    Dense(input_dim) -> dropout, then ``inputs + fc_factor * outputs``.

    Args:
        input_dim: width of the second Dense layer; the first one is
            ``4 * input_dim`` wide.
        dropout: rate used by both dropout layers.
        fc_factor: multiplier applied to the module output before it is added
            to the input.
        kernel_regularizer: regularizer for Dense kernels and LayerNorm gamma.
        bias_regularizer: regularizer for Dense biases and LayerNorm beta.
        name: layer name, also used as prefix for the sub-layer names.
    """

    def __init__(self,
                 input_dim,
                 dropout=0.0,
                 fc_factor=0.5,
                 kernel_regularizer=L2,
                 bias_regularizer=L2,
                 name="ff_module",
                 **kwargs):
        super(FFModule, self).__init__(name=name, **kwargs)
        self.fc_factor = fc_factor
        self.ln = tf.keras.layers.LayerNormalization(
            name=f"{name}_ln",
            gamma_regularizer=kernel_regularizer,
            beta_regularizer=bias_regularizer
        )
        self.ffn1 = tf.keras.layers.Dense(
            4 * input_dim, name=f"{name}_dense_1",
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer
        )
        self.swish = tf.keras.layers.Activation(tf.nn.swish, name=f"{name}_swish_activation")
        self.do1 = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout_1")
        self.ffn2 = tf.keras.layers.Dense(
            input_dim, name=f"{name}_dense_2",
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer
        )
        self.do2 = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout_2")
        self.res_add = tf.keras.layers.Add(name=f"{name}_add")

    def call(self, inputs, training=False, **kwargs):
        """Apply the feed-forward stack and add the scaled result to ``inputs``."""
        outputs = self.ln(inputs, training=training)
        outputs = self.ffn1(outputs, training=training)
        outputs = self.swish(outputs)
        outputs = self.do1(outputs, training=training)
        outputs = self.ffn2(outputs, training=training)
        outputs = self.do2(outputs, training=training)
        # Residual branch is scaled by fc_factor before the addition.
        outputs = self.res_add([inputs, self.fc_factor * outputs])
        return outputs

    def get_config(self):
        """Return the base config merged with ``fc_factor`` and all sub-layer configs.

        The sub-layer configs are merged flat with ``dict.update``, so any key
        they share is overwritten by whichever update runs last.
        """
        conf = super(FFModule, self).get_config()
        conf.update({"fc_factor": self.fc_factor})
        conf.update(self.ln.get_config())
        conf.update(self.ffn1.get_config())
        conf.update(self.swish.get_config())
        conf.update(self.do1.get_config())
        conf.update(self.ffn2.get_config())
        conf.update(self.do2.get_config())
        conf.update(self.res_add.get_config())
        return conf


class MHSAModule(tf.keras.layers.Layer):
    """Multi-head self-attention module with a residual connection.

    Pipeline: LayerNorm -> attention -> dropout, then ``inputs + outputs``.

    Args:
        head_size: forwarded to the attention layer.
        num_heads: forwarded to the attention layer.
        dropout: rate of the dropout applied to the attention output.
        mha_type: ``"relmha"`` (RelPositionMultiHeadAttention, receives the
            positional encoding as a fourth input) or ``"mha"``
            (MultiHeadAttention, positional encoding is added to the input).
        kernel_regularizer: regularizer for attention kernels and LayerNorm gamma.
        bias_regularizer: regularizer for attention biases and LayerNorm beta.
        name: layer name, also used as prefix for the sub-layer names.

    Raises:
        ValueError: if ``mha_type`` is neither ``"mha"`` nor ``"relmha"``.
    """

    def __init__(self,
                 head_size,
                 num_heads,
                 dropout=0.0,
                 mha_type="relmha",
                 kernel_regularizer=L2,
                 bias_regularizer=L2,
                 name="mhsa_module",
                 **kwargs):
        super(MHSAModule, self).__init__(name=name, **kwargs)
        self.ln = tf.keras.layers.LayerNormalization(
            name=f"{name}_ln",
            gamma_regularizer=kernel_regularizer,
            beta_regularizer=bias_regularizer
        )
        if mha_type == "relmha":
            self.mha = RelPositionMultiHeadAttention(
                name=f"{name}_mhsa",
                head_size=head_size, num_heads=num_heads,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer
            )
        elif mha_type == "mha":
            self.mha = MultiHeadAttention(
                name=f"{name}_mhsa",
                head_size=head_size, num_heads=num_heads,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer
            )
        else:
            raise ValueError("mha_type must be either 'mha' or 'relmha'")
        self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")
        self.res_add = tf.keras.layers.Add(name=f"{name}_add")
        self.mha_type = mha_type

    def call(self, inputs, training=False, mask=None, **kwargs):
        """Run self-attention on ``inputs = (features, pos)`` and add the residual."""
        inputs, pos = inputs  # pos is positional encoding
        outputs = self.ln(inputs, training=training)
        if self.mha_type == "relmha":
            # Relative attention consumes the positional encoding separately.
            outputs = self.mha([outputs, outputs, outputs, pos], training=training, mask=mask)
        else:
            # Plain attention: positional encoding is summed into the features.
            outputs = outputs + pos
            outputs = self.mha([outputs, outputs, outputs], training=training, mask=mask)
        outputs = self.do(outputs, training=training)
        outputs = self.res_add([inputs, outputs])
        return outputs

    def get_config(self):
        """Return the base config merged with ``mha_type`` and all sub-layer configs.

        Keys shared between sub-layer configs are overwritten by later updates.
        """
        conf = super(MHSAModule, self).get_config()
        conf.update({"mha_type": self.mha_type})
        conf.update(self.ln.get_config())
        conf.update(self.mha.get_config())
        conf.update(self.do.get_config())
        conf.update(self.res_add.get_config())
        return conf


class ConvModule(tf.keras.layers.Layer):
    """Convolution module with a residual connection.

    Pipeline: LayerNorm -> reshape to 4D -> pointwise Conv2D(2 * input_dim) ->
    GLU -> DepthwiseConv2D(kernel_size x 1) -> BatchNorm -> swish ->
    pointwise Conv2D(input_dim) -> reshape back to 3D -> dropout, then
    ``inputs + outputs``.

    Args:
        input_dim: number of filters of the last pointwise convolution; the
            first one uses ``2 * input_dim``.
        kernel_size: size of the depthwise kernel along the first spatial axis.
        dropout: rate of the final dropout.
        depth_multiplier: forwarded to the depthwise convolution.
        kernel_regularizer: regularizer for conv kernels and BatchNorm gamma.
        bias_regularizer: regularizer for conv biases and BatchNorm beta.
        name: layer name, also used as prefix for the sub-layer names.
    """

    def __init__(self,
                 input_dim,
                 kernel_size=32,
                 dropout=0.0,
                 depth_multiplier=1,
                 kernel_regularizer=L2,
                 bias_regularizer=L2,
                 name="conv_module",
                 **kwargs):
        super(ConvModule, self).__init__(name=name, **kwargs)
        # NOTE(review): unlike the other modules, this LayerNormalization has no
        # explicit name or regularizers; left as-is so the layer name is not altered.
        self.ln = tf.keras.layers.LayerNormalization()
        self.pw_conv_1 = tf.keras.layers.Conv2D(
            filters=2 * input_dim, kernel_size=1, strides=1,
            padding="valid", name=f"{name}_pw_conv_1",
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer
        )
        self.glu = GLU(name=f"{name}_glu")
        self.dw_conv = tf.keras.layers.DepthwiseConv2D(
            kernel_size=(kernel_size, 1), strides=1,
            padding="same", name=f"{name}_dw_conv",
            depth_multiplier=depth_multiplier,
            depthwise_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer
        )
        self.bn = tf.keras.layers.BatchNormalization(
            name=f"{name}_bn",
            gamma_regularizer=kernel_regularizer,
            beta_regularizer=bias_regularizer
        )
        self.swish = tf.keras.layers.Activation(tf.nn.swish, name=f"{name}_swish_activation")
        self.pw_conv_2 = tf.keras.layers.Conv2D(
            filters=input_dim, kernel_size=1, strides=1,
            padding="valid", name=f"{name}_pw_conv_2",
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer
        )
        self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")
        self.res_add = tf.keras.layers.Add(name=f"{name}_add")

    def call(self, inputs, training=False, **kwargs):
        """Apply the convolution stack to 3D ``inputs`` and add the residual."""
        outputs = self.ln(inputs, training=training)
        B, T, E = shape_util.shape_list(outputs)
        # Insert a singleton axis so the 2D convolutions can be applied.
        outputs = tf.reshape(outputs, [B, T, 1, E])
        outputs = self.pw_conv_1(outputs, training=training)
        outputs = self.glu(outputs)
        outputs = self.dw_conv(outputs, training=training)
        outputs = self.bn(outputs, training=training)
        outputs = self.swish(outputs)
        outputs = self.pw_conv_2(outputs, training=training)
        # Drop the singleton axis again; requires the last dim to equal E.
        outputs = tf.reshape(outputs, [B, T, E])
        outputs = self.do(outputs, training=training)
        outputs = self.res_add([inputs, outputs])
        return outputs

    def get_config(self):
        """Return the base config merged with all sub-layer configs.

        Keys shared between sub-layer configs are overwritten by later updates.
        """
        conf = super(ConvModule, self).get_config()
        conf.update(self.ln.get_config())
        conf.update(self.pw_conv_1.get_config())
        conf.update(self.glu.get_config())
        conf.update(self.dw_conv.get_config())
        conf.update(self.bn.get_config())
        conf.update(self.swish.get_config())
        conf.update(self.pw_conv_2.get_config())
        conf.update(self.do.get_config())
        conf.update(self.res_add.get_config())
        return conf


class ConformerBlock(tf.keras.layers.Layer):
    """One conformer block: FFModule -> MHSAModule -> ConvModule -> FFModule -> LayerNorm.

    Args:
        input_dim: forwarded to both feed-forward modules and the conv module.
        dropout: dropout rate forwarded to every sub-module.
        fc_factor: residual scale forwarded to both feed-forward modules.
        head_size: forwarded to the attention module.
        num_heads: forwarded to the attention module.
        mha_type: ``"relmha"`` or ``"mha"``, forwarded to the attention module.
        kernel_size: forwarded to the conv module.
        depth_multiplier: forwarded to the conv module.
        kernel_regularizer: kernel/gamma regularizer for all sub-layers.
        bias_regularizer: bias/beta regularizer for all sub-layers.
        name: layer name, also used as prefix for the sub-module names.
    """

    def __init__(self,
                 input_dim,
                 dropout=0.0,
                 fc_factor=0.5,
                 head_size=36,
                 num_heads=4,
                 mha_type="relmha",
                 kernel_size=32,
                 depth_multiplier=1,
                 kernel_regularizer=L2,
                 bias_regularizer=L2,
                 name="conformer_block",
                 **kwargs):
        super(ConformerBlock, self).__init__(name=name, **kwargs)
        self.ffm1 = FFModule(
            input_dim=input_dim, dropout=dropout,
            fc_factor=fc_factor, name=f"{name}_ff_module_1",
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer
        )
        self.mhsam = MHSAModule(
            mha_type=mha_type,
            head_size=head_size, num_heads=num_heads,
            dropout=dropout, name=f"{name}_mhsa_module",
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer
        )
        self.convm = ConvModule(
            input_dim=input_dim, kernel_size=kernel_size,
            dropout=dropout, name=f"{name}_conv_module",
            depth_multiplier=depth_multiplier,
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer
        )
        self.ffm2 = FFModule(
            input_dim=input_dim, dropout=dropout,
            fc_factor=fc_factor, name=f"{name}_ff_module_2",
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer
        )
        self.ln = tf.keras.layers.LayerNormalization(
            name=f"{name}_ln",
            gamma_regularizer=kernel_regularizer,
            # beta is the offset term, so it takes the bias regularizer
            # (previously kernel_regularizer was passed here by mistake).
            beta_regularizer=bias_regularizer
        )

    def call(self, inputs, training=False, mask=None, **kwargs):
        """Run the block on ``inputs = (features, pos)`` and return the features."""
        inputs, pos = inputs  # pos is positional encoding
        outputs = self.ffm1(inputs, training=training, **kwargs)
        outputs = self.mhsam([outputs, pos], training=training, mask=mask, **kwargs)
        outputs = self.convm(outputs, training=training, **kwargs)
        outputs = self.ffm2(outputs, training=training, **kwargs)
        outputs = self.ln(outputs, training=training)
        return outputs

    def get_config(self):
        """Return the base config merged with all sub-module configs.

        Keys shared between sub-module configs are overwritten by later updates.
        """
        conf = super(ConformerBlock, self).get_config()
        conf.update(self.ffm1.get_config())
        conf.update(self.mhsam.get_config())
        conf.update(self.convm.get_config())
        conf.update(self.ffm2.get_config())
        conf.update(self.ln.get_config())
        return conf


class ConformerEncoder(tf.keras.Model):
    """Conformer encoder: subsampling -> Dense(dmodel) -> dropout -> ``num_blocks`` conformer blocks.

    Args:
        subsampling: mapping of keyword arguments for the subsampling layer.
            Its optional ``"type"`` entry selects ``"conv2d"`` (default) or
            ``"vgg"``. The mapping is copied and never modified.
        positional_encoding: one of ``"sinusoid"``, ``"sinusoid_v2"``,
            ``"sinusoid_concat"``, ``"sinusoid_concat_v2"`` or ``"subsampling"``
            (a linear activation, i.e. the projected features themselves).
        dmodel: width of the linear projection and ``input_dim`` of each block.
        num_blocks: number of conformer blocks.
        mha_type: ``"relmha"`` or ``"mha"``, forwarded to each block.
        head_size: forwarded to each block.
        num_heads: forwarded to each block.
        kernel_size: forwarded to each block.
        depth_multiplier: forwarded to each block.
        fc_factor: forwarded to each block.
        dropout: rate for the encoder dropout and for each block.
        kernel_regularizer: kernel regularizer for all sub-layers.
        bias_regularizer: bias regularizer for all sub-layers.
        name: model name, also used as prefix for the sub-layer names.

    Raises:
        ValueError: on an unknown subsampling type or positional encoding.
    """

    def __init__(self,
                 subsampling,
                 positional_encoding="sinusoid",
                 dmodel=144,
                 num_blocks=16,
                 mha_type="relmha",
                 head_size=36,
                 num_heads=4,
                 kernel_size=32,
                 depth_multiplier=1,
                 fc_factor=0.5,
                 dropout=0.0,
                 kernel_regularizer=L2,
                 bias_regularizer=L2,
                 name="conformer_encoder",
                 **kwargs):
        super(ConformerEncoder, self).__init__(name=name, **kwargs)

        # Work on a copy: popping "type" from the caller's dict would make a
        # second encoder built from the same config silently fall back to "conv2d".
        subsampling = dict(subsampling)
        subsampling_name = subsampling.pop("type", "conv2d")
        if subsampling_name == "vgg":
            subsampling_class = VggSubsampling
        elif subsampling_name == "conv2d":
            subsampling_class = Conv2dSubsampling
        else:
            raise ValueError("subsampling must be either 'conv2d' or 'vgg'")

        self.conv_subsampling = subsampling_class(
            **subsampling, name=f"{name}_subsampling",
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer
        )

        if positional_encoding == "sinusoid":
            self.pe = PositionalEncoding(name=f"{name}_pe")
        elif positional_encoding == "sinusoid_v2":
            self.pe = PositionalEncoding(alpha=2, beta=0, name=f"{name}_pe")
        elif positional_encoding == "sinusoid_concat":
            self.pe = PositionalEncodingConcat(name=f"{name}_pe")
        elif positional_encoding == "sinusoid_concat_v2":
            self.pe = PositionalEncodingConcat(alpha=2, beta=-1, name=f"{name}_pe")
        elif positional_encoding == "subsampling":
            self.pe = tf.keras.layers.Activation("linear", name=f"{name}_pe")
        else:
            # Adjacent literals instead of a backslash continuation, which
            # embedded the source indentation in the message.
            raise ValueError(
                "positional_encoding must be either 'sinusoid', "
                "'sinusoid_concat', 'sinusoid_v2', 'sinusoid_concat_v2' or 'subsampling'"
            )

        self.linear = tf.keras.layers.Dense(
            dmodel, name=f"{name}_linear",
            kernel_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer
        )
        self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")

        self.conformer_blocks = []
        for i in range(num_blocks):
            conformer_block = ConformerBlock(
                input_dim=dmodel,
                dropout=dropout,
                fc_factor=fc_factor,
                head_size=head_size,
                num_heads=num_heads,
                mha_type=mha_type,
                kernel_size=kernel_size,
                depth_multiplier=depth_multiplier,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer,
                name=f"{name}_block_{i}"
            )
            self.conformer_blocks.append(conformer_block)

    def call(self, inputs, training=False, mask=None, **kwargs):
        """Encode ``inputs`` and return the output of the last conformer block."""
        # input with shape [B, T, V1, V2]
        outputs = self.conv_subsampling(inputs, training=training)
        outputs = self.linear(outputs, training=training)
        # Positional encoding is computed from the projected features, before dropout.
        pe = self.pe(outputs)
        outputs = self.do(outputs, training=training)
        for cblock in self.conformer_blocks:
            outputs = cblock([outputs, pe], training=training, mask=mask, **kwargs)
        return outputs

    def get_config(self):
        """Return the base config merged with all sub-layer and block configs.

        Keys shared between the merged configs are overwritten by later updates.
        """
        conf = super(ConformerEncoder, self).get_config()
        conf.update(self.conv_subsampling.get_config())
        conf.update(self.linear.get_config())
        conf.update(self.do.get_config())
        conf.update(self.pe.get_config())
        for cblock in self.conformer_blocks:
            conf.update(cblock.get_config())
        return conf
# Copyright 2020 Huy Le Nguyen (@usimarit)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Ref: https://github.com/iankur/ContextNet """

from typing import List, Optional
import tensorflow as tf
from ...utils import math_util

L2 = tf.keras.regularizers.l2(1e-6)


def get_activation(activation: str = "silu"):
    """Map an activation name (case-insensitive) to its TensorFlow callable.

    Args:
        activation: ``"silu"``/``"swish"``, ``"relu"`` or ``"linear"``.

    Returns:
        ``tf.nn.swish``, ``tf.nn.relu`` or ``tf.keras.activations.linear``.

    Raises:
        ValueError: if the name is not one of the supported activations.
    """
    activation = activation.lower()
    if activation in ("silu", "swish"):
        return tf.nn.swish
    if activation == "relu":
        return tf.nn.relu
    if activation == "linear":
        return tf.keras.activations.linear
    raise ValueError("activation must be either 'silu', 'swish', 'relu' or 'linear'")


class Reshape(tf.keras.layers.Layer):
    """Layer wrapper around ``math_util.merge_two_last_dims``."""

    def call(self, inputs):
        return math_util.merge_two_last_dims(inputs)


class ConvModule(tf.keras.layers.Layer):
    """SeparableConv1D ("same" padding) -> BatchNormalization -> activation.

    Args:
        kernel_size: kernel size of the separable convolution.
        strides: stride of the separable convolution; also kept as ``self.strides``.
        filters: number of output filters.
        activation: name understood by ``get_activation``.
        kernel_regularizer: regularizer for depthwise and pointwise kernels.
        bias_regularizer: regularizer for the convolution bias.
    """

    def __init__(self,
                 kernel_size: int = 3,
                 strides: int = 1,
                 filters: int = 256,
                 activation: str = "silu",
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 **kwargs):
        super(ConvModule, self).__init__(**kwargs)
        self.strides = strides
        self.conv = tf.keras.layers.SeparableConv1D(
            filters=filters, kernel_size=kernel_size, strides=strides, padding="same",
            depthwise_regularizer=kernel_regularizer, pointwise_regularizer=kernel_regularizer,
            bias_regularizer=bias_regularizer, name=f"{self.name}_conv"
        )
        self.bn = tf.keras.layers.BatchNormalization(name=f"{self.name}_bn")
        self.activation = get_activation(activation)

    def call(self, inputs, training=False, **kwargs):
        """Apply convolution, batch normalization and the activation in order."""
        outputs = self.conv(inputs, training=training)
        outputs = self.bn(outputs, training=training)
        outputs = self.activation(outputs)
        return outputs


class SEModule(tf.keras.layers.Layer):
    """Convolution followed by squeeze-and-excitation gating.

    The convolved features are summed over axis 1 and divided by the given
    lengths, passed through two Dense layers (``filters // 8`` then
    ``filters`` units) and a sigmoid, and the result multiplies the
    convolved features.

    Args:
        kernel_size: forwarded to the inner ``ConvModule``.
        strides: forwarded to the inner ``ConvModule``.
        filters: filters of the inner ``ConvModule`` and units of the second Dense.
        activation: name understood by ``get_activation``.
        kernel_regularizer: forwarded to the inner ``ConvModule``.
        bias_regularizer: forwarded to the inner ``ConvModule``.
    """

    def __init__(self,
                 kernel_size: int = 3,
                 strides: int = 1,
                 filters: int = 256,
                 activation: str = "silu",
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 **kwargs):
        super(SEModule, self).__init__(**kwargs)
        self.conv = ConvModule(
            kernel_size=kernel_size, strides=strides,
            filters=filters, activation=activation,
            kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
            name=f"{self.name}_conv_module"
        )
        self.activation = get_activation(activation)
        self.fc1 = tf.keras.layers.Dense(filters // 8, name=f"{self.name}_fc1")
        self.fc2 = tf.keras.layers.Dense(filters, name=f"{self.name}_fc2")

    def call(self, inputs, training=False, **kwargs):
        """Gate the convolved features of ``inputs = (features, input_length)``."""
        features, input_length = inputs
        outputs = self.conv(features, training=training)

        # Length-normalized sum over axis 1 (a mean over the given lengths).
        lengths = tf.expand_dims(tf.cast(input_length, dtype=outputs.dtype), axis=1)
        se = tf.divide(tf.reduce_sum(outputs, axis=1), lengths)
        se = self.fc1(se, training=training)
        se = self.activation(se)
        se = self.fc2(se, training=training)
        # NOTE(review): the activation is applied again before the sigmoid;
        # kept as in the original - confirm this is intended.
        se = self.activation(se)
        se = tf.nn.sigmoid(se)
        se = tf.expand_dims(se, axis=1)  # broadcast the gate along axis 1

        outputs = tf.multiply(outputs, se)
        return outputs


class ConvBlock(tf.keras.layers.Layer):
    """Stack of ``ConvModule`` layers, an ``SEModule`` and an optional residual branch.

    ``nlayers - 1`` stride-1 modules are followed by one module using
    ``strides``, then the SE module. With ``residual=True`` the block input is
    passed through a linear ``ConvModule`` with the same stride and added to
    the SE output before the final activation.

    Args:
        nlayers: total number of plain conv modules (including the strided one).
        kernel_size: kernel size of every conv module.
        filters: base filter count; scaled by ``alpha`` for the layers.
            ``self.dmodel`` keeps the unscaled value.
        strides: stride of the last conv module and of the residual branch.
        residual: whether to build the residual branch.
        activation: name understood by ``get_activation``.
        alpha: multiplier applied to ``filters``.
        kernel_regularizer: forwarded to every sub-module.
        bias_regularizer: forwarded to every sub-module.
    """

    def __init__(self,
                 nlayers: int = 3,
                 kernel_size: int = 3,
                 filters: int = 256,
                 strides: int = 1,
                 residual: bool = True,
                 activation: str = 'silu',
                 alpha: float = 1.0,
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 **kwargs):
        super(ConvBlock, self).__init__(**kwargs)

        self.dmodel = filters
        self.time_reduction_factor = strides
        filters = int(filters * alpha)

        self.convs = []
        for i in range(nlayers - 1):
            self.convs.append(
                ConvModule(
                    kernel_size=kernel_size, strides=1,
                    filters=filters, activation=activation,
                    kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
                    name=f"{self.name}_conv_module_{i}"
                )
            )

        # Only the last plain conv module applies the block's stride.
        self.last_conv = ConvModule(
            kernel_size=kernel_size, strides=strides,
            filters=filters, activation=activation,
            kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
            name=f"{self.name}_conv_module_{nlayers - 1}"
        )

        self.se = SEModule(
            kernel_size=kernel_size, strides=1, filters=filters, activation=activation,
            kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
            name=f"{self.name}_se"
        )

        self.residual = None
        if residual:
            self.residual = ConvModule(
                kernel_size=kernel_size, strides=strides,
                filters=filters, activation="linear",
                kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
                name=f"{self.name}_residual"
            )

        self.activation = get_activation(activation)

    def call(self, inputs, training=False, **kwargs):
        """Process ``inputs = (features, input_length)``.

        Returns:
            Tuple ``(outputs, input_length)`` where ``input_length`` has been
            reduced by the stride of the last conv module.
        """
        features, input_length = inputs
        outputs = features
        for conv in self.convs:
            outputs = conv(outputs, training=training)
        outputs = self.last_conv(outputs, training=training)
        input_length = math_util.get_reduced_length(input_length, self.last_conv.strides)
        outputs = self.se([outputs, input_length], training=training)
        if self.residual is not None:
            res = self.residual(features, training=training)
            outputs = tf.add(outputs, res)
        outputs = self.activation(outputs)
        return outputs, input_length


class ContextNetEncoder(tf.keras.Model):
    """ContextNet encoder: a reshape followed by a sequence of ``ConvBlock`` layers.

    Args:
        blocks: list of keyword-argument dicts, one per ``ConvBlock``.
            ``None`` (the default) builds an encoder without blocks.
        alpha: filter multiplier forwarded to every block.
        kernel_regularizer: forwarded to every block.
        bias_regularizer: forwarded to every block.
    """

    def __init__(self,
                 blocks: Optional[List[dict]] = None,
                 alpha: float = 1.0,
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 **kwargs):
        super(ContextNetEncoder, self).__init__(**kwargs)

        self.reshape = Reshape(name=f"{self.name}_reshape")

        # None replaces the former mutable ``[]`` default, which was shared
        # between all calls of the constructor.
        self.blocks = []
        for i, config in enumerate(blocks or []):
            self.blocks.append(
                ConvBlock(
                    **config, alpha=alpha,
                    kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
                    name=f"{self.name}_block_{i}"
                )
            )

    def call(self, inputs, training=False, **kwargs):
        """Encode ``inputs = (features, input_length)`` and return the features only."""
        outputs, input_length = inputs
        outputs = self.reshape(outputs)
        for block in self.blocks:
            outputs, input_length = block([outputs, input_length], training=training)
        return outputs
training=False, **kwargs): - outputs = self.ln(inputs, training=training) - outputs = self.ffn1(outputs, training=training) - outputs = self.swish(outputs) - outputs = self.do1(outputs, training=training) - outputs = self.ffn2(outputs, training=training) - outputs = self.do2(outputs, training=training) - outputs = self.res_add([inputs, self.fc_factor * outputs]) - return outputs - - def get_config(self): - conf = super(FFModule, self).get_config() - conf.update({"fc_factor": self.fc_factor}) - conf.update(self.ln.get_config()) - conf.update(self.ffn1.get_config()) - conf.update(self.swish.get_config()) - conf.update(self.do1.get_config()) - conf.update(self.ffn2.get_config()) - conf.update(self.do2.get_config()) - conf.update(self.res_add.get_config()) - return conf - - -class MHSAModule(tf.keras.layers.Layer): - def __init__(self, - head_size, - num_heads, - dropout=0.0, - mha_type="relmha", - kernel_regularizer=L2, - bias_regularizer=L2, - name="mhsa_module", - **kwargs): - super(MHSAModule, self).__init__(name=name, **kwargs) - self.ln = tf.keras.layers.LayerNormalization( - name=f"{name}_ln", - gamma_regularizer=kernel_regularizer, - beta_regularizer=bias_regularizer - ) - if mha_type == "relmha": - self.mha = RelPositionMultiHeadAttention( - name=f"{name}_mhsa", - head_size=head_size, num_heads=num_heads, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer - ) - elif mha_type == "mha": - self.mha = MultiHeadAttention( - name=f"{name}_mhsa", - head_size=head_size, num_heads=num_heads, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer - ) - else: - raise ValueError("mha_type must be either 'mha' or 'relmha'") - self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout") - self.res_add = tf.keras.layers.Add(name=f"{name}_add") - self.mha_type = mha_type - - def call(self, inputs, training=False, mask=None, **kwargs): - inputs, pos = inputs # pos is positional encoding - outputs = self.ln(inputs, 
training=training) - if self.mha_type == "relmha": - outputs = self.mha([outputs, outputs, outputs, pos], training=training, mask=mask) - else: - outputs = outputs + pos - outputs = self.mha([outputs, outputs, outputs], training=training, mask=mask) - outputs = self.do(outputs, training=training) - outputs = self.res_add([inputs, outputs]) - return outputs - - def get_config(self): - conf = super(MHSAModule, self).get_config() - conf.update({"mha_type": self.mha_type}) - conf.update(self.ln.get_config()) - conf.update(self.mha.get_config()) - conf.update(self.do.get_config()) - conf.update(self.res_add.get_config()) - return conf - - -class ConvModule(tf.keras.layers.Layer): - def __init__(self, - input_dim, - kernel_size=32, - dropout=0.0, - depth_multiplier=1, - kernel_regularizer=L2, - bias_regularizer=L2, - name="conv_module", - **kwargs): - super(ConvModule, self).__init__(name=name, **kwargs) - self.ln = tf.keras.layers.LayerNormalization() - self.pw_conv_1 = tf.keras.layers.Conv2D( - filters=2 * input_dim, kernel_size=1, strides=1, - padding="valid", name=f"{name}_pw_conv_1", - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer - ) - self.glu = GLU(name=f"{name}_glu") - self.dw_conv = tf.keras.layers.DepthwiseConv2D( - kernel_size=(kernel_size, 1), strides=1, - padding="same", name=f"{name}_dw_conv", - depth_multiplier=depth_multiplier, - depthwise_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer - ) - self.bn = tf.keras.layers.BatchNormalization( - name=f"{name}_bn", - gamma_regularizer=kernel_regularizer, - beta_regularizer=bias_regularizer - ) - self.swish = tf.keras.layers.Activation(tf.nn.swish, name=f"{name}_swish_activation") - self.pw_conv_2 = tf.keras.layers.Conv2D( - filters=input_dim, kernel_size=1, strides=1, - padding="valid", name=f"{name}_pw_conv_2", - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer - ) - self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout") - 
self.res_add = tf.keras.layers.Add(name=f"{name}_add") - - def call(self, inputs, training=False, **kwargs): - outputs = self.ln(inputs, training=training) - B, T, E = shape_util.shape_list(outputs) - outputs = tf.reshape(outputs, [B, T, 1, E]) - outputs = self.pw_conv_1(outputs, training=training) - outputs = self.glu(outputs) - outputs = self.dw_conv(outputs, training=training) - outputs = self.bn(outputs, training=training) - outputs = self.swish(outputs) - outputs = self.pw_conv_2(outputs, training=training) - outputs = tf.reshape(outputs, [B, T, E]) - outputs = self.do(outputs, training=training) - outputs = self.res_add([inputs, outputs]) - return outputs - - def get_config(self): - conf = super(ConvModule, self).get_config() - conf.update(self.ln.get_config()) - conf.update(self.pw_conv_1.get_config()) - conf.update(self.glu.get_config()) - conf.update(self.dw_conv.get_config()) - conf.update(self.bn.get_config()) - conf.update(self.swish.get_config()) - conf.update(self.pw_conv_2.get_config()) - conf.update(self.do.get_config()) - conf.update(self.res_add.get_config()) - return conf - - -class ConformerBlock(tf.keras.layers.Layer): - def __init__(self, - input_dim, - dropout=0.0, - fc_factor=0.5, - head_size=36, - num_heads=4, - mha_type="relmha", - kernel_size=32, - depth_multiplier=1, - kernel_regularizer=L2, - bias_regularizer=L2, - name="conformer_block", - **kwargs): - super(ConformerBlock, self).__init__(name=name, **kwargs) - self.ffm1 = FFModule( - input_dim=input_dim, dropout=dropout, - fc_factor=fc_factor, name=f"{name}_ff_module_1", - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer - ) - self.mhsam = MHSAModule( - mha_type=mha_type, - head_size=head_size, num_heads=num_heads, - dropout=dropout, name=f"{name}_mhsa_module", - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer - ) - self.convm = ConvModule( - input_dim=input_dim, kernel_size=kernel_size, - dropout=dropout, 
name=f"{name}_conv_module", - depth_multiplier=depth_multiplier, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer - ) - self.ffm2 = FFModule( - input_dim=input_dim, dropout=dropout, - fc_factor=fc_factor, name=f"{name}_ff_module_2", - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer - ) - self.ln = tf.keras.layers.LayerNormalization( - name=f"{name}_ln", - gamma_regularizer=kernel_regularizer, - beta_regularizer=kernel_regularizer - ) - - def call(self, inputs, training=False, mask=None, **kwargs): - inputs, pos = inputs # pos is positional encoding - outputs = self.ffm1(inputs, training=training, **kwargs) - outputs = self.mhsam([outputs, pos], training=training, mask=mask, **kwargs) - outputs = self.convm(outputs, training=training, **kwargs) - outputs = self.ffm2(outputs, training=training, **kwargs) - outputs = self.ln(outputs, training=training) - return outputs - - def get_config(self): - conf = super(ConformerBlock, self).get_config() - conf.update(self.ffm1.get_config()) - conf.update(self.mhsam.get_config()) - conf.update(self.convm.get_config()) - conf.update(self.ffm2.get_config()) - conf.update(self.ln.get_config()) - return conf - - -class ConformerEncoder(tf.keras.Model): - def __init__(self, - subsampling, - positional_encoding="sinusoid", - dmodel=144, - num_blocks=16, - mha_type="relmha", - head_size=36, - num_heads=4, - kernel_size=32, - depth_multiplier=1, - fc_factor=0.5, - dropout=0.0, - kernel_regularizer=L2, - bias_regularizer=L2, - name="conformer_encoder", - **kwargs): - super(ConformerEncoder, self).__init__(name=name, **kwargs) - - subsampling_name = subsampling.pop("type", "conv2d") - if subsampling_name == "vgg": - subsampling_class = VggSubsampling - elif subsampling_name == "conv2d": - subsampling_class = Conv2dSubsampling - else: - raise ValueError("subsampling must be either 'conv2d' or 'vgg'") - - self.conv_subsampling = subsampling_class( - **subsampling, 
name=f"{name}_subsampling", - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer - ) - - if positional_encoding == "sinusoid": - self.pe = PositionalEncoding(name=f"{name}_pe") - elif positional_encoding == "sinusoid_v2": - self.pe = PositionalEncoding(alpha=2, beta=0, name=f"{name}_pe") - elif positional_encoding == "sinusoid_concat": - self.pe = PositionalEncodingConcat(name=f"{name}_pe") - elif positional_encoding == "sinusoid_concat_v2": - self.pe = PositionalEncodingConcat(alpha=2, beta=-1, name=f"{name}_pe") - elif positional_encoding == "subsampling": - self.pe = tf.keras.layers.Activation("linear", name=f"{name}_pe") - else: - raise ValueError("positional_encoding must be either 'sinusoid', \ - 'sinusoid_concat', 'sinusoid_v2', 'sinusoid_concat_v2' or 'subsampling'") - - self.linear = tf.keras.layers.Dense( - dmodel, name=f"{name}_linear", - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer - ) - self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout") - - self.conformer_blocks = [] - for i in range(num_blocks): - conformer_block = ConformerBlock( - input_dim=dmodel, - dropout=dropout, - fc_factor=fc_factor, - head_size=head_size, - num_heads=num_heads, - mha_type=mha_type, - kernel_size=kernel_size, - depth_multiplier=depth_multiplier, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, - name=f"{name}_block_{i}" - ) - self.conformer_blocks.append(conformer_block) - - def call(self, inputs, training=False, mask=None, **kwargs): - # input with shape [B, T, V1, V2] - outputs = self.conv_subsampling(inputs, training=training) - outputs = self.linear(outputs, training=training) - pe = self.pe(outputs) - outputs = self.do(outputs, training=training) - for cblock in self.conformer_blocks: - outputs = cblock([outputs, pe], training=training, mask=mask, **kwargs) - return outputs - - def get_config(self): - conf = super(ConformerEncoder, self).get_config() - 
conf.update(self.conv_subsampling.get_config()) - conf.update(self.linear.get_config()) - conf.update(self.do.get_config()) - conf.update(self.pe.get_config()) - for cblock in self.conformer_blocks: - conf.update(cblock.get_config()) - return conf class Conformer(Transducer): diff --git a/tensorflow_asr/models/transducer/contextnet.py b/tensorflow_asr/models/transducer/contextnet.py index dac9e9050d..2f47f100ee 100644 --- a/tensorflow_asr/models/transducer/contextnet.py +++ b/tensorflow_asr/models/transducer/contextnet.py @@ -11,185 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Ref: https://github.com/iankur/ContextNet """ from typing import List import tensorflow as tf -from .transducer import Transducer -from ...utils import math_util - -L2 = tf.keras.regularizers.l2(1e-6) - - -def get_activation(activation: str = "silu"): - activation = activation.lower() - if activation in ["silu", "swish"]: return tf.nn.swish - elif activation == "relu": return tf.nn.relu - elif activation == "linear": return tf.keras.activations.linear - else: raise ValueError("activation must be either 'silu', 'swish', 'relu' or 'linear'") - - -class Reshape(tf.keras.layers.Layer): - def call(self, inputs): return math_util.merge_two_last_dims(inputs) - - -class ConvModule(tf.keras.layers.Layer): - def __init__(self, - kernel_size: int = 3, - strides: int = 1, - filters: int = 256, - activation: str = "silu", - kernel_regularizer = None, - bias_regularizer = None, - **kwargs): - super(ConvModule, self).__init__(**kwargs) - self.strides = strides - self.conv = tf.keras.layers.SeparableConv1D( - filters=filters, kernel_size=kernel_size, strides=strides, padding="same", - depthwise_regularizer=kernel_regularizer, pointwise_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer, name=f"{self.name}_conv" - ) - self.bn = 
tf.keras.layers.BatchNormalization(name=f"{self.name}_bn") - self.activation = get_activation(activation) - - def call(self, inputs, training=False, **kwargs): - outputs = self.conv(inputs, training=training) - outputs = self.bn(outputs, training=training) - outputs = self.activation(outputs) - return outputs - - -class SEModule(tf.keras.layers.Layer): - def __init__(self, - kernel_size: int = 3, - strides: int = 1, - filters: int = 256, - activation: str = "silu", - kernel_regularizer = None, - bias_regularizer = None, - **kwargs): - super(SEModule, self).__init__(**kwargs) - self.conv = ConvModule( - kernel_size=kernel_size, strides=strides, - filters=filters, activation=activation, - kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, - name=f"{self.name}_conv_module" - ) - self.activation = get_activation(activation) - self.fc1 = tf.keras.layers.Dense(filters // 8, name=f"{self.name}_fc1") - self.fc2 = tf.keras.layers.Dense(filters, name=f"{self.name}_fc2") - - def call(self, inputs, training=False, **kwargs): - features, input_length = inputs - outputs = self.conv(features, training=training) - - se = tf.divide(tf.reduce_sum(outputs, axis=1), tf.expand_dims(tf.cast(input_length, dtype=outputs.dtype), axis=1)) - se = self.fc1(se, training=training) - se = self.activation(se) - se = self.fc2(se, training=training) - se = self.activation(se) - se = tf.nn.sigmoid(se) - se = tf.expand_dims(se, axis=1) - - outputs = tf.multiply(outputs, se) - return outputs - -class ConvBlock(tf.keras.layers.Layer): - def __init__(self, - nlayers: int = 3, - kernel_size: int = 3, - filters: int = 256, - strides: int = 1, - residual: bool = True, - activation: str = 'silu', - alpha: float = 1.0, - kernel_regularizer = None, - bias_regularizer = None, - **kwargs): - super(ConvBlock, self).__init__(**kwargs) - - self.dmodel = filters - self.time_reduction_factor = strides - filters = int(filters * alpha) - - self.convs = [] - for i in range(nlayers - 1): - 
self.convs.append( - ConvModule( - kernel_size=kernel_size, strides=1, - filters=filters, activation=activation, - kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, - name=f"{self.name}_conv_module_{i}" - ) - ) - - self.last_conv = ConvModule( - kernel_size=kernel_size, strides=strides, - filters=filters, activation=activation, - kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, - name=f"{self.name}_conv_module_{nlayers - 1}" - ) - - self.se = SEModule( - kernel_size=kernel_size, strides=1, filters=filters, activation=activation, - kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, - name=f"{self.name}_se" - ) - - self.residual = None - if residual: - self.residual = ConvModule( - kernel_size=kernel_size, strides=strides, - filters=filters, activation="linear", - kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, - name=f"{self.name}_residual" - ) - - self.activation = get_activation(activation) - - def call(self, inputs, training=False, **kwargs): - features, input_length = inputs - outputs = features - for conv in self.convs: - outputs = conv(outputs, training=training) - outputs = self.last_conv(outputs, training=training) - input_length = math_util.get_reduced_length(input_length, self.last_conv.strides) - outputs = self.se([outputs, input_length], training=training) - if self.residual is not None: - res = self.residual(features, training=training) - outputs = tf.add(outputs, res) - outputs = self.activation(outputs) - return outputs, input_length - - -class ContextNetEncoder(tf.keras.Model): - def __init__(self, - blocks: List[dict] = [], - alpha: float = 1.0, - kernel_regularizer = None, - bias_regularizer = None, - **kwargs): - super(ContextNetEncoder, self).__init__(**kwargs) - - self.reshape = Reshape(name=f"{self.name}_reshape") - - self.blocks = [] - for i, config in enumerate(blocks): - self.blocks.append( - ConvBlock( - **config, alpha=alpha, - 
kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, - name=f"{self.name}_block_{i}" - ) - ) - - def call(self, inputs, training=False, **kwargs): - outputs, input_length = inputs - outputs = self.reshape(outputs) - for block in self.blocks: - outputs, input_length = block([outputs, input_length], training=training) - return outputs +from ..encoders.contextnet import ContextNetEncoder, L2 +from .transducer import Transducer class ContextNet(Transducer): From 29a285911d544f972e2a658997b5b999fa3d6a0f Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Wed, 14 Apr 2021 00:22:27 +0700 Subject: [PATCH 05/13] :writing_hand: update augmentations --- examples/conformer/config.yml | 2 +- examples/contextnet/config.yml | 2 +- examples/rnn_transducer/config.yml | 2 +- tensorflow_asr/augmentations/README.md | 2 +- tensorflow_asr/augmentations/augmentation.py | 8 ++++---- tests/conformer/config.yml | 2 +- tests/contextnet/config.yml | 2 +- tests/streaming_transducer/config.yml | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/conformer/config.yml b/examples/conformer/config.yml index d77dfa1f24..79bef5276b 100755 --- a/examples/conformer/config.yml +++ b/examples/conformer/config.yml @@ -66,7 +66,7 @@ learning_config: train_dataset_config: use_tf: True augmentation_config: - after: + feature_augment: time_masking: num_masks: 10 mask_factor: 100 diff --git a/examples/contextnet/config.yml b/examples/contextnet/config.yml index 5127dd1de6..24b2f17e9d 100644 --- a/examples/contextnet/config.yml +++ b/examples/contextnet/config.yml @@ -198,7 +198,7 @@ learning_config: train_dataset_config: use_tf: True augmentation_config: - after: + feature_augment: time_masking: num_masks: 10 mask_factor: 100 diff --git a/examples/rnn_transducer/config.yml b/examples/rnn_transducer/config.yml index 47b0e41ae9..8acfee4f92 100644 --- a/examples/rnn_transducer/config.yml +++ b/examples/rnn_transducer/config.yml @@ -55,7 +55,7 @@ learning_config: 
train_dataset_config: use_tf: True augmentation_config: - after: + feature_augment: time_masking: num_masks: 10 mask_factor: 100 diff --git a/tensorflow_asr/augmentations/README.md b/tensorflow_asr/augmentations/README.md index 6dc714a967..4723c7659d 100644 --- a/tensorflow_asr/augmentations/README.md +++ b/tensorflow_asr/augmentations/README.md @@ -7,7 +7,7 @@ Augmentations use `nlpaug`, for futher information, see [nlpaug.readthedocs.io]( ```yaml augmentations: before: ... - after: ... + feature_augment: ... ``` Where `before` and `after` are augmentation methods to use before and after features extraction. diff --git a/tensorflow_asr/augmentations/augmentation.py b/tensorflow_asr/augmentations/augmentation.py index 314a6488b6..4ffa03df29 100644 --- a/tensorflow_asr/augmentations/augmentation.py +++ b/tensorflow_asr/augmentations/augmentation.py @@ -27,8 +27,8 @@ class Augmentation: def __init__(self, config: dict = None): if not config: config = {} self.prob = float(config.pop("prob", 0.5)) - self.before = self.parse(config.pop("before", {})) - self.after = self.parse(config.pop("after", {})) + self.signal_augmentations = self.parse(config.pop("signal_augment", {})) + self.feature_augmentations = self.parse(config.pop("feature_augment", {})) def _augment(self, inputs, augmentations): outputs = inputs @@ -39,11 +39,11 @@ def _augment(self, inputs, augmentations): @tf.function def signal_augment(self, inputs): - return self._augment(inputs, self.before) + return self._augment(inputs, self.signal_augmentations) @tf.function def feature_augment(self, inputs): - return self._augment(inputs, self.after) + return self._augment(inputs, self.feature_augmentations) @staticmethod def parse(config: dict) -> list: diff --git a/tests/conformer/config.yml b/tests/conformer/config.yml index 3f4bd41415..5e94f0cd41 100644 --- a/tests/conformer/config.yml +++ b/tests/conformer/config.yml @@ -62,7 +62,7 @@ model_config: learning_config: augmentations: - after: + feature_augment: 
time_masking: num_masks: 10 mask_factor: 100 diff --git a/tests/contextnet/config.yml b/tests/contextnet/config.yml index 7b5d8d2333..a510b710ad 100644 --- a/tests/contextnet/config.yml +++ b/tests/contextnet/config.yml @@ -196,7 +196,7 @@ model_config: learning_config: augmentations: - after: + feature_augment: time_masking: num_masks: 10 mask_factor: 100 diff --git a/tests/streaming_transducer/config.yml b/tests/streaming_transducer/config.yml index ff2c6a4ed5..4f8d3e52d9 100644 --- a/tests/streaming_transducer/config.yml +++ b/tests/streaming_transducer/config.yml @@ -53,7 +53,7 @@ model_config: learning_config: augmentations: - after: + feature_augment: time_masking: num_masks: 10 mask_factor: 100 From ccfd924b6fab544258ed6c270c53686f9f8b9298 Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Wed, 14 Apr 2021 00:41:38 +0700 Subject: [PATCH 06/13] :writing_hand: update examples scripts --- examples/conformer/masking/README.md | 5 - examples/conformer/masking/masking.py | 32 ---- .../masking/train_ga_masking_conformer.py | 131 ---------------- .../train_ga_masking_subword_conformer.py | 147 ------------------ .../masking/train_masking_conformer.py | 128 --------------- .../train_masking_subword_conformer.py | 143 ----------------- examples/conformer/masking/trainer.py | 55 ------- .../conformer/save_conformer_from_weights.py | 68 -------- .../{test_subword_conformer.py => test.py} | 0 examples/conformer/test_conformer.py | 85 ---------- ...{tflite_subword_conformer.py => tflite.py} | 0 examples/conformer/tflite_conformer.py | 66 -------- ...in_keras_subword_conformer.py => train.py} | 20 +-- examples/conformer/train_conformer.py | 106 ------------- examples/conformer/train_ga_conformer.py | 108 ------------- .../conformer/train_ga_subword_conformer.py | 127 --------------- examples/conformer/train_subword_conformer.py | 124 --------------- ...eras_subword_conformer.py => train_tpu.py} | 20 +-- .../{test_subword_contextnet.py => test.py} | 0 
examples/contextnet/test_contextnet.py | 85 ---------- ...tflite_subword_contextnet.py => tflite.py} | 0 examples/contextnet/tflite_contextnet.py | 67 -------- ...n_keras_subword_contextnet.py => train.py} | 0 examples/contextnet/train_contextnet.py | 106 ------------- examples/contextnet/train_ga_contextnet.py | 108 ------------- .../contextnet/train_ga_subword_contextnet.py | 122 --------------- .../contextnet/train_subword_contextnet.py | 119 -------------- examples/deepspeech2/{test_ds2.py => test.py} | 0 .../{train_keras_ds2.py => train.py} | 0 examples/deepspeech2/train_ds2.py | 88 ----------- examples/deepspeech2/train_ga_ds2.py | 91 ----------- examples/jasper/{test_jasper.py => test.py} | 0 .../{train_keras_jasper.py => train.py} | 0 examples/jasper/train_ga_jasper.py | 91 ----------- examples/jasper/train_jasper.py | 90 ----------- ...test_subword_rnn_transducer.py => test.py} | 0 .../rnn_transducer/test_rnn_transducer.py | 88 ----------- ...te_subword_rnn_transducer.py => tflite.py} | 0 .../rnn_transducer/tflite_rnn_transducer.py | 70 --------- ...ras_subword_rnn_transducer.py => train.py} | 20 +-- .../rnn_transducer/train_ga_rnn_transducer.py | 100 ------------ .../train_ga_subword_rnn_transducer.py | 116 -------------- .../rnn_transducer/train_rnn_transducer.py | 97 ------------ .../train_subword_rnn_transducer.py | 111 ------------- 44 files changed, 21 insertions(+), 2913 deletions(-) delete mode 100644 examples/conformer/masking/README.md delete mode 100644 examples/conformer/masking/masking.py delete mode 100644 examples/conformer/masking/train_ga_masking_conformer.py delete mode 100644 examples/conformer/masking/train_ga_masking_subword_conformer.py delete mode 100644 examples/conformer/masking/train_masking_conformer.py delete mode 100644 examples/conformer/masking/train_masking_subword_conformer.py delete mode 100644 examples/conformer/masking/trainer.py delete mode 100644 examples/conformer/save_conformer_from_weights.py rename 
examples/conformer/{test_subword_conformer.py => test.py} (100%) mode change 100755 => 100644 delete mode 100755 examples/conformer/test_conformer.py rename examples/conformer/{tflite_subword_conformer.py => tflite.py} (100%) delete mode 100644 examples/conformer/tflite_conformer.py rename examples/conformer/{train_keras_subword_conformer.py => train.py} (88%) delete mode 100644 examples/conformer/train_conformer.py delete mode 100644 examples/conformer/train_ga_conformer.py delete mode 100644 examples/conformer/train_ga_subword_conformer.py delete mode 100644 examples/conformer/train_subword_conformer.py rename examples/conformer/{train_tpu_keras_subword_conformer.py => train_tpu.py} (88%) rename examples/contextnet/{test_subword_contextnet.py => test.py} (100%) delete mode 100644 examples/contextnet/test_contextnet.py rename examples/contextnet/{tflite_subword_contextnet.py => tflite.py} (100%) delete mode 100644 examples/contextnet/tflite_contextnet.py rename examples/contextnet/{train_keras_subword_contextnet.py => train.py} (100%) delete mode 100644 examples/contextnet/train_contextnet.py delete mode 100644 examples/contextnet/train_ga_contextnet.py delete mode 100644 examples/contextnet/train_ga_subword_contextnet.py delete mode 100644 examples/contextnet/train_subword_contextnet.py rename examples/deepspeech2/{test_ds2.py => test.py} (100%) rename examples/deepspeech2/{train_keras_ds2.py => train.py} (100%) delete mode 100644 examples/deepspeech2/train_ds2.py delete mode 100644 examples/deepspeech2/train_ga_ds2.py rename examples/jasper/{test_jasper.py => test.py} (100%) rename examples/jasper/{train_keras_jasper.py => train.py} (100%) delete mode 100644 examples/jasper/train_ga_jasper.py delete mode 100644 examples/jasper/train_jasper.py rename examples/rnn_transducer/{test_subword_rnn_transducer.py => test.py} (100%) delete mode 100644 examples/rnn_transducer/test_rnn_transducer.py rename examples/rnn_transducer/{tflite_subword_rnn_transducer.py => 
tflite.py} (100%) delete mode 100644 examples/rnn_transducer/tflite_rnn_transducer.py rename examples/rnn_transducer/{train_keras_subword_rnn_transducer.py => train.py} (88%) delete mode 100644 examples/rnn_transducer/train_ga_rnn_transducer.py delete mode 100644 examples/rnn_transducer/train_ga_subword_rnn_transducer.py delete mode 100644 examples/rnn_transducer/train_rnn_transducer.py delete mode 100644 examples/rnn_transducer/train_subword_rnn_transducer.py diff --git a/examples/conformer/masking/README.md b/examples/conformer/masking/README.md deleted file mode 100644 index f63d41a3b9..0000000000 --- a/examples/conformer/masking/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Training Conformer with Attention Masking - -This is an example for anyone who wants to apply masking in Conformer. - -**Note**: This is not a good practice since Conformer uses time reduction, which leads to create incorrect maskings. \ No newline at end of file diff --git a/examples/conformer/masking/masking.py b/examples/conformer/masking/masking.py deleted file mode 100644 index 69f8e0b01a..0000000000 --- a/examples/conformer/masking/masking.py +++ /dev/null @@ -1,32 +0,0 @@ -import tensorflow as tf -from tensorflow_asr.utils.utils import shape_list, get_reduced_length - - -def create_padding_mask(features, input_length, time_reduction_factor): - """ - Create masking with 0 for paddings and 1 for non-paddings - Args: - features ([tf.Tensor]): audio features with shape [B, T, F, C] - input_length ([tf.Tensor]): audio features length with shape [B] - time_reduction_factor ([int]) - - Returns: - [tf.Tensor]: with shape [B, Tquery, Tkey] - """ - batch_size, padded_time, _, _ = shape_list(features) - reduced_padded_time = get_reduced_length(padded_time, time_reduction_factor) - - def create_mask(length): - reduced_length = get_reduced_length(length, time_reduction_factor) - mask = tf.ones([reduced_length, reduced_length], dtype=tf.float32) - return tf.pad( - mask, - [ - [0, reduced_padded_time - 
reduced_length], - [0, reduced_padded_time - reduced_length] - ], - mode="CONSTANT", - constant_values=0.0 - ) - - return tf.map_fn(create_mask, input_length, fn_output_signature=tf.TensorSpec([None, None], dtype=tf.float32)) diff --git a/examples/conformer/masking/train_ga_masking_conformer.py b/examples/conformer/masking/train_ga_masking_conformer.py deleted file mode 100644 index 62a0deb240..0000000000 --- a/examples/conformer/masking/train_ga_masking_conformer.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") - -parser.add_argument("--acs", type=int, default=None, - help="Train accumulation steps") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], - help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", - help="Enable mixed precision") - -parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from trainer import TrainerWithMaskingGA -from tensorflow_asr.models.conformer import Conformer -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = 
TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - data_paths=config.learning_config.dataset_config.train_paths, - tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True - ) - eval_dataset = ASRTFRecordDataset( - data_paths=config.learning_config.dataset_config.eval_paths, - tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True - ) -else: - train_dataset = ASRSliceDataset( - data_paths=config.learning_config.dataset_config.train_paths, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True - ) - eval_dataset = ASRSliceDataset( - data_paths=config.learning_config.dataset_config.eval_paths, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True - ) - -conformer_trainer = TrainerWithMaskingGA( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with conformer_trainer.strategy.scope(): - # build model - conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) - conformer._build(speech_featurizer.shape) - conformer.summary(line_length=120) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=config.model_config["encoder_dmodel"], - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"])) - ), - beta_1=config.learning_config.optimizer_config["beta1"], - 
beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] - ) - -conformer_trainer.compile(model=conformer, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -conformer_trainer.fit(train_dataset, eval_dataset, - train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs) diff --git a/examples/conformer/masking/train_ga_masking_subword_conformer.py b/examples/conformer/masking/train_ga_masking_subword_conformer.py deleted file mode 100644 index 1e74f9a68b..0000000000 --- a/examples/conformer/masking/train_ga_masking_subword_conformer.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") - -parser.add_argument("--acs", type=int, default=None, - help="Train accumulation steps") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], - help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", - help="Enable mixed precision") - -parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") - -parser.add_argument("--subwords", type=str, default=None, - help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], - help="Transcript files for generating subwords") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer -from 
trainer import TrainerWithMaskingGA -from tensorflow_asr.models.conformer import Conformer -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) - -if args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) -else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - data_paths=config.learning_config.dataset_config.train_paths, - tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True - ) - eval_dataset = ASRTFRecordDataset( - data_paths=config.learning_config.dataset_config.eval_paths, - tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True - ) -else: - train_dataset = ASRSliceDataset( - data_paths=config.learning_config.dataset_config.train_paths, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True - ) - eval_dataset = ASRSliceDataset( - data_paths=config.learning_config.dataset_config.eval_paths, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True - ) - -conformer_trainer = TrainerWithMaskingGA( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with 
conformer_trainer.strategy.scope(): - # build model - conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) - conformer._build(speech_featurizer.shape) - conformer.summary(line_length=120) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=config.model_config["encoder_dmodel"], - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"])) - ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] - ) - -conformer_trainer.compile(model=conformer, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -conformer_trainer.fit(train_dataset, eval_dataset, - train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs) diff --git a/examples/conformer/masking/train_masking_conformer.py b/examples/conformer/masking/train_masking_conformer.py deleted file mode 100644 index 82dbbda9ec..0000000000 --- a/examples/conformer/masking/train_masking_conformer.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], - help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", - help="Enable mixed precision") - -parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from trainer import TrainerWithMasking -from tensorflow_asr.models.conformer import Conformer -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - 
train_dataset = ASRTFRecordDataset( - data_paths=config.learning_config.dataset_config.train_paths, - tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True - ) - eval_dataset = ASRTFRecordDataset( - data_paths=config.learning_config.dataset_config.eval_paths, - tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True - ) -else: - train_dataset = ASRSliceDataset( - data_paths=config.learning_config.dataset_config.train_paths, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True - ) - eval_dataset = ASRSliceDataset( - data_paths=config.learning_config.dataset_config.eval_paths, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True - ) - -conformer_trainer = TrainerWithMasking( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with conformer_trainer.strategy.scope(): - # build model - conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) - conformer._build(speech_featurizer.shape) - conformer.summary(line_length=120) - - optimizer_config = config.learning_config.optimizer_config - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=config.model_config["encoder_dmodel"], - warmup_steps=optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"])) - ), - beta_1=optimizer_config["beta1"], - beta_2=optimizer_config["beta2"], - epsilon=optimizer_config["epsilon"] - ) - -conformer_trainer.compile(model=conformer, 
optimizer=optimizer, - max_to_keep=args.max_ckpts) - -conformer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs) diff --git a/examples/conformer/masking/train_masking_subword_conformer.py b/examples/conformer/masking/train_masking_subword_conformer.py deleted file mode 100644 index be99ec3ceb..0000000000 --- a/examples/conformer/masking/train_masking_subword_conformer.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, - help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", - help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, - help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, - help="Evaluation batch size per replica") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], - help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", - help="Enable mixed precision") - -parser.add_argument("--cache", default=False, action="store_true", - help="Enable caching for dataset") - -parser.add_argument("--subwords", type=str, default=None, - help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], - help="Transcript files for generating subwords") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer -from trainer import TrainerWithMasking -from tensorflow_asr.models.conformer import Conformer 
-from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) - -if args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) -else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - data_paths=config.learning_config.dataset_config.train_paths, - tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True - ) - eval_dataset = ASRTFRecordDataset( - data_paths=config.learning_config.dataset_config.eval_paths, - tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True - ) -else: - train_dataset = ASRSliceDataset( - data_paths=config.learning_config.dataset_config.train_paths, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - augmentations=config.learning_config.augmentations, - stage="train", cache=args.cache, shuffle=True - ) - eval_dataset = ASRSliceDataset( - data_paths=config.learning_config.dataset_config.eval_paths, - speech_featurizer=speech_featurizer, - text_featurizer=text_featurizer, - stage="eval", cache=args.cache, shuffle=True - ) - -conformer_trainer = TrainerWithMasking( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with conformer_trainer.strategy.scope(): - # build model - conformer = Conformer(**config.model_config, 
vocabulary_size=text_featurizer.num_classes) - conformer._build(speech_featurizer.shape) - conformer.summary(line_length=120) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=config.model_config["encoder_dmodel"], - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"])) - ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] - ) - -conformer_trainer.compile(model=conformer, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -conformer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs) diff --git a/examples/conformer/masking/trainer.py b/examples/conformer/masking/trainer.py deleted file mode 100644 index b860eafb2c..0000000000 --- a/examples/conformer/masking/trainer.py +++ /dev/null @@ -1,55 +0,0 @@ -import tensorflow as tf - -from masking import create_padding_mask -from tensorflow_asr.runners.transducer_runners import TransducerTrainer, TransducerTrainerGA -from tensorflow_asr.losses.rnnt_losses import rnnt_loss -from tensorflow_asr.utils.utils import get_reduced_length - - -class TrainerWithMasking(TransducerTrainer): - @tf.function(experimental_relax_shapes=True) - def _train_step(self, batch): - _, features, input_length, labels, label_length, pred_inp = batch - - mask = create_padding_mask(features, input_length, self.model.time_reduction_factor) - - with tf.GradientTape() as tape: - logits = self.model([features, input_length, pred_inp, label_length + 1], training=True, mask=mask) - tape.watch(logits) - per_train_loss = rnnt_loss( - logits=logits, labels=labels, label_length=label_length, - logit_length=get_reduced_length(input_length, self.model.time_reduction_factor), - blank=self.text_featurizer.blank - ) - train_loss = tf.nn.compute_average_loss(per_train_loss, - 
global_batch_size=self.global_batch_size) - - gradients = tape.gradient(train_loss, self.model.trainable_variables) - self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables)) - - self.train_metrics["transducer_loss"].update_state(per_train_loss) - - -class TrainerWithMaskingGA(TransducerTrainerGA): - @tf.function(experimental_relax_shapes=True) - def _train_step(self, batch): - _, features, input_length, labels, label_length, pred_inp = batch - - mask = create_padding_mask(features, input_length, self.model.time_reduction_factor) - - with tf.GradientTape() as tape: - logits = self.model([features, input_length, pred_inp, label_length + 1], training=True, mask=mask) - tape.watch(logits) - per_train_loss = rnnt_loss( - logits=logits, labels=labels, label_length=label_length, - logit_length=get_reduced_length(input_length, self.model.time_reduction_factor), - blank=self.text_featurizer.blank - ) - train_loss = tf.nn.compute_average_loss( - per_train_loss, - global_batch_size=self.global_batch_size - ) - - gradients = tape.gradient(train_loss, self.model.trainable_variables) - self.accumulation.accumulate(gradients) - self.train_metrics["transducer_loss"].update_state(per_train_loss) diff --git a/examples/conformer/save_conformer_from_weights.py b/examples/conformer/save_conformer_from_weights.py deleted file mode 100644 index bb09c7d329..0000000000 --- a/examples/conformer/save_conformer_from_weights.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_devices - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Testing") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") - -parser.add_argument("--saved", type=str, default=None, - help="Path to saved model") - -parser.add_argument("--device", type=int, default=0, - help="Device's id to run test on") - -parser.add_argument("--cpu", default=False, action="store_true", - help="Whether to only use cpu") - -parser.add_argument("output", type=str, default=None, - help="Output to save whole model") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -setup_devices([args.device], cpu=args.cpu) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.models.conformer import Conformer - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -tf.random.set_seed(0) -assert args.saved - -# build model -conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) -conformer._build(speech_featurizer.shape) -conformer.load_weights(args.saved) -conformer.summary(line_length=150) -conformer.save(args.output) - -print(f"Saved whole model to {args.output}") diff --git a/examples/conformer/test_subword_conformer.py b/examples/conformer/test.py old mode 100755 new mode 100644 similarity index 100% rename from 
examples/conformer/test_subword_conformer.py rename to examples/conformer/test.py diff --git a/examples/conformer/test_conformer.py b/examples/conformer/test_conformer.py deleted file mode 100755 index 17f40a6d5f..0000000000 --- a/examples/conformer/test_conformer.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_devices - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Testing") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--saved", type=str, default=None, help="Path to saved model") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--device", type=int, default=0, help="Device's id to run test on") - -parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu") - -parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix") - -args = parser.parse_args() - 
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -setup_devices([args.device], cpu=args.cpu) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.base_runners import BaseTester -from tensorflow_asr.models.conformer import Conformer - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -tf.random.set_seed(0) -assert args.saved - -if args.tfrecords: - test_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.test_dataset_config) - ) -else: - test_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.test_dataset_config) - ) - -# build model -conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) -conformer._build(speech_featurizer.shape) -conformer.load_weights(args.saved) -conformer.summary(line_length=120) -conformer.add_featurizers(speech_featurizer, text_featurizer) - -conformer_tester = BaseTester( - config=config.learning_config.running_config, - output_name=args.output_name -) -conformer_tester.compile(conformer) -conformer_tester.run(test_dataset) diff --git a/examples/conformer/tflite_subword_conformer.py b/examples/conformer/tflite.py similarity index 100% rename from examples/conformer/tflite_subword_conformer.py rename to examples/conformer/tflite.py diff --git a/examples/conformer/tflite_conformer.py b/examples/conformer/tflite_conformer.py deleted file mode 100644 index a44997a3be..0000000000 --- a/examples/conformer/tflite_conformer.py +++ /dev/null @@ -1,66 +0,0 @@ -# 
Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from tensorflow_asr.utils import setup_environment - -setup_environment() -import tensorflow as tf - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.models.conformer import Conformer - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Testing") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") - -parser.add_argument("--saved", type=str, default=None, - help="Path to saved model") - -parser.add_argument("output", type=str, default=None, - help="TFLite file path to be exported") - -args = parser.parse_args() - -assert args.saved and args.output - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -# build model -conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) -conformer._build(speech_featurizer.shape) -conformer.load_weights(args.saved) -conformer.summary(line_length=150) -conformer.add_featurizers(speech_featurizer, text_featurizer) - -concrete_func = 
conformer.make_tflite_function().get_concrete_function() -converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) -converter.optimizations = [tf.lite.Optimize.DEFAULT] -converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] -tflite_model = converter.convert() - -if not os.path.exists(os.path.dirname(args.output)): - os.makedirs(os.path.dirname(args.output)) -with open(args.output, "wb") as tflite_out: - tflite_out.write(tflite_model) diff --git a/examples/conformer/train_keras_subword_conformer.py b/examples/conformer/train.py similarity index 88% rename from examples/conformer/train_keras_subword_conformer.py rename to examples/conformer/train.py index 7f2219cff2..0c844062a1 100644 --- a/examples/conformer/train_keras_subword_conformer.py +++ b/examples/conformer/train.py @@ -46,9 +46,7 @@ parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") +parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") args = parser.parse_args() @@ -59,7 +57,7 @@ from tensorflow_asr.configs.config import Config from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer from tensorflow_asr.models.keras.conformer import Conformer from tensorflow_asr.optimizers.schedules import TransformerSchedule @@ -68,17 +66,13 @@ if args.sentence_piece: print("Loading SentencePiece model ...") - text_featurizer 
= SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords) -elif args.subwords and os.path.exists(args.subwords): + text_featurizer = SentencePieceFeaturizer(config.decoder_config) +elif args.subwords: print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) + text_featurizer = SubwordFeaturizer(config.decoder_config) else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) + print("Use characters ...") + text_featurizer = CharFeaturizer(config.decoder_config) if args.tfrecords: train_dataset = ASRTFRecordDatasetKeras( diff --git a/examples/conformer/train_conformer.py b/examples/conformer/train_conformer.py deleted file mode 100644 index e919f953d3..0000000000 --- a/examples/conformer/train_conformer.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainer -from tensorflow_asr.models.conformer import Conformer -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, 
text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -conformer_trainer = TransducerTrainer( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with conformer_trainer.strategy.scope(): - # build model - conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) - conformer._build(speech_featurizer.shape) - conformer.summary(line_length=120) - - optimizer_config = config.learning_config.optimizer_config - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=conformer.dmodel, - warmup_steps=optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(conformer.dmodel)) - ), - beta_1=optimizer_config["beta1"], - beta_2=optimizer_config["beta2"], - epsilon=optimizer_config["epsilon"] - ) - -conformer_trainer.compile(model=conformer, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -conformer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs) diff --git a/examples/conformer/train_ga_conformer.py b/examples/conformer/train_ga_conformer.py deleted file mode 100644 index d2ca6ade2c..0000000000 --- a/examples/conformer/train_ga_conformer.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from 
tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA -from tensorflow_asr.models.conformer import Conformer -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -conformer_trainer = TransducerTrainerGA( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with conformer_trainer.strategy.scope(): - # build model - conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) - conformer._build(speech_featurizer.shape) - conformer.summary(line_length=120) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=conformer.dmodel, - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(conformer.dmodel)) - ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] - ) - -conformer_trainer.compile(model=conformer, optimizer=optimizer, - max_to_keep=args.max_ckpts) - 
-conformer_trainer.fit(train_dataset, eval_dataset, - train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs) diff --git a/examples/conformer/train_ga_subword_conformer.py b/examples/conformer/train_ga_subword_conformer.py deleted file mode 100644 index c36d1a5468..0000000000 --- a/examples/conformer/train_ga_subword_conformer.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") - -parser.add_argument("--sentence_piece", default=False, action="store_true", 
help="Whether to use `SentencePiece` model") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA -from tensorflow_asr.models.conformer import Conformer -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) - -if args.sentence_piece: - print("Loading SentencePiece model ...") - text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords) -elif args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) -else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - 
**vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -conformer_trainer = TransducerTrainerGA( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with conformer_trainer.strategy.scope(): - # build model - conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) - conformer._build(speech_featurizer.shape) - conformer.summary(line_length=120) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=conformer.dmodel, - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(conformer.dmodel)) - ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] - ) - -conformer_trainer.compile(model=conformer, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -conformer_trainer.fit(train_dataset, eval_dataset, - train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs) diff --git a/examples/conformer/train_subword_conformer.py b/examples/conformer/train_subword_conformer.py deleted file mode 100644 index 74c143894e..0000000000 --- a/examples/conformer/train_subword_conformer.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = 
setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainer -from tensorflow_asr.models.conformer import Conformer -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) - -if args.sentence_piece: - print("Loading SentencePiece model ...") - text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords) -elif args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) -else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -conformer_trainer = TransducerTrainer( - config=config.learning_config.running_config, - 
text_featurizer=text_featurizer, strategy=strategy -) - -with conformer_trainer.strategy.scope(): - # build model - conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) - conformer._build(speech_featurizer.shape) - conformer.summary(line_length=120) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=conformer.dmodel, - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(conformer.dmodel)) - ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] - ) - -conformer_trainer.compile(model=conformer, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -conformer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs) diff --git a/examples/conformer/train_tpu_keras_subword_conformer.py b/examples/conformer/train_tpu.py similarity index 88% rename from examples/conformer/train_tpu_keras_subword_conformer.py rename to examples/conformer/train_tpu.py index 8162a3bdae..8a0937c985 100644 --- a/examples/conformer/train_tpu_keras_subword_conformer.py +++ b/examples/conformer/train_tpu.py @@ -42,9 +42,7 @@ parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") +parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") parser.add_argument("--saved", type=str, default=None, help="Path to saved model") @@ -59,7 +57,7 @@ from tensorflow_asr.configs.config import Config from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from 
tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer from tensorflow_asr.models.keras.conformer import Conformer from tensorflow_asr.optimizers.schedules import TransformerSchedule @@ -68,17 +66,13 @@ if args.sentence_piece: print("Loading SentencePiece model ...") - text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords) -elif args.subwords and os.path.exists(args.subwords): + text_featurizer = SentencePieceFeaturizer(config.decoder_config) +elif args.subwords: print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) + text_featurizer = SubwordFeaturizer(config.decoder_config) else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) + print("Use characters...") + text_featurizer = CharFeaturizer(config.decoder_config) train_dataset = ASRTFRecordDatasetKeras( speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, diff --git a/examples/contextnet/test_subword_contextnet.py b/examples/contextnet/test.py similarity index 100% rename from examples/contextnet/test_subword_contextnet.py rename to examples/contextnet/test.py diff --git a/examples/contextnet/test_contextnet.py b/examples/contextnet/test_contextnet.py deleted file mode 100644 index d62a9bf954..0000000000 --- a/examples/contextnet/test_contextnet.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_devices - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="ContextNet Testing") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--saved", type=str, default=None, help="Path to saved model") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--device", type=int, default=0, help="Device's id to run test on") - -parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu") - -parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -setup_devices([args.device], cpu=args.cpu) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.base_runners import BaseTester -from 
tensorflow_asr.models.contextnet import ContextNet - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -tf.random.set_seed(0) -assert args.saved - -if args.tfrecords: - test_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.test_dataset_config) - ) -else: - test_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.test_dataset_config) - ) - -# build model -contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes) -contextnet._build(speech_featurizer.shape) -contextnet.load_weights(args.saved) -contextnet.summary(line_length=120) -contextnet.add_featurizers(speech_featurizer, text_featurizer) - -contextnet_tester = BaseTester( - config=config.learning_config.running_config, - output_name=args.output_name -) -contextnet_tester.compile(contextnet) -contextnet_tester.run(test_dataset) diff --git a/examples/contextnet/tflite_subword_contextnet.py b/examples/contextnet/tflite.py similarity index 100% rename from examples/contextnet/tflite_subword_contextnet.py rename to examples/contextnet/tflite.py diff --git a/examples/contextnet/tflite_contextnet.py b/examples/contextnet/tflite_contextnet.py deleted file mode 100644 index 4452ce8394..0000000000 --- a/examples/contextnet/tflite_contextnet.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from tensorflow_asr.utils import setup_environment - -setup_environment() -import tensorflow as tf - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.models.contextnet import ContextNet - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="ContextNet Testing") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") - -parser.add_argument("--saved", type=str, default=None, - help="Path to saved model") - -parser.add_argument("output", type=str, default=None, - help="TFLite file path to be exported") - -args = parser.parse_args() - -assert args.saved and args.output - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -# build model -contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes) -contextnet._build(speech_featurizer.shape) -contextnet.load_weights(args.saved) -contextnet.summary(line_length=150) -contextnet.add_featurizers(speech_featurizer, text_featurizer) - -concrete_func = contextnet.make_tflite_function().get_concrete_function() -converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) -converter.optimizations = 
[tf.lite.Optimize.DEFAULT] -converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, - tf.lite.OpsSet.SELECT_TF_OPS] -tflite_model = converter.convert() - -if not os.path.exists(os.path.dirname(args.output)): - os.makedirs(os.path.dirname(args.output)) -with open(args.output, "wb") as tflite_out: - tflite_out.write(tflite_model) diff --git a/examples/contextnet/train_keras_subword_contextnet.py b/examples/contextnet/train.py similarity index 100% rename from examples/contextnet/train_keras_subword_contextnet.py rename to examples/contextnet/train.py diff --git a/examples/contextnet/train_contextnet.py b/examples/contextnet/train_contextnet.py deleted file mode 100644 index a3a261a3fb..0000000000 --- a/examples/contextnet/train_contextnet.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="ContextNet Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainer -from tensorflow_asr.models.contextnet import ContextNet -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, 
text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -contextnet_trainer = TransducerTrainer( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with contextnet_trainer.strategy.scope(): - # build model - contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes) - contextnet._build(speech_featurizer.shape) - contextnet.summary(line_length=120) - - optimizer_config = config.learning_config.optimizer_config - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=contextnet.dmodel, - warmup_steps=optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(contextnet.dmodel)) - ), - beta_1=optimizer_config["beta1"], - beta_2=optimizer_config["beta2"], - epsilon=optimizer_config["epsilon"] - ) - -contextnet_trainer.compile(model=contextnet, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -contextnet_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs) diff --git a/examples/contextnet/train_ga_contextnet.py b/examples/contextnet/train_ga_contextnet.py deleted file mode 100644 index d906ce2ba3..0000000000 --- a/examples/contextnet/train_ga_contextnet.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="ContextNet Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from 
tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA -from tensorflow_asr.models.contextnet import ContextNet -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -contextnet_trainer = TransducerTrainerGA( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with contextnet_trainer.strategy.scope(): - # build model - contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes) - contextnet._build(speech_featurizer.shape) - contextnet.summary(line_length=120) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=contextnet.dmodel, - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(contextnet.dmodel)) - ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] - ) - -contextnet_trainer.compile(model=contextnet, optimizer=optimizer, - max_to_keep=args.max_ckpts) - 
-contextnet_trainer.fit(train_dataset, eval_dataset, - train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs) diff --git a/examples/contextnet/train_ga_subword_contextnet.py b/examples/contextnet/train_ga_subword_contextnet.py deleted file mode 100644 index b1f1cec0f2..0000000000 --- a/examples/contextnet/train_ga_subword_contextnet.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="ContextNet Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], 
help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA -from tensorflow_asr.models.contextnet import ContextNet -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) - -if args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) -else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - 
**vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -contextnet_trainer = TransducerTrainerGA( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with contextnet_trainer.strategy.scope(): - # build model - contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes) - contextnet._build(speech_featurizer.shape) - contextnet.summary(line_length=120) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=contextnet.dmodel, - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(contextnet.dmodel)) - ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] - ) - -contextnet_trainer.compile(model=contextnet, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -contextnet_trainer.fit(train_dataset, eval_dataset, - train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs) diff --git a/examples/contextnet/train_subword_contextnet.py b/examples/contextnet/train_subword_contextnet.py deleted file mode 100644 index 74e07e88da..0000000000 --- a/examples/contextnet/train_subword_contextnet.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="ContextNet Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainer -from tensorflow_asr.models.contextnet import 
ContextNet -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) - -if args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) -else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -contextnet_trainer = TransducerTrainer( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with contextnet_trainer.strategy.scope(): - # build model - contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes) - contextnet._build(speech_featurizer.shape) - contextnet.summary(line_length=120) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=contextnet.dmodel, - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(contextnet.dmodel)) - ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - 
epsilon=config.learning_config.optimizer_config["epsilon"] - ) - -contextnet_trainer.compile(model=contextnet, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -contextnet_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs) diff --git a/examples/deepspeech2/test_ds2.py b/examples/deepspeech2/test.py similarity index 100% rename from examples/deepspeech2/test_ds2.py rename to examples/deepspeech2/test.py diff --git a/examples/deepspeech2/train_keras_ds2.py b/examples/deepspeech2/train.py similarity index 100% rename from examples/deepspeech2/train_keras_ds2.py rename to examples/deepspeech2/train.py diff --git a/examples/deepspeech2/train_ds2.py b/examples/deepspeech2/train_ds2.py deleted file mode 100644 index 8f1d201ed9..0000000000 --- a/examples/deepspeech2/train_ds2.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Deep Speech 2 Training") - -parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.ctc_runners import CTCTrainer -from tensorflow_asr.models.deepspeech2 import DeepSpeech2 - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - 
eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -ctc_trainer = CTCTrainer(text_featurizer, config.learning_config.running_config) -# Build DS2 model -with ctc_trainer.strategy.scope(): - ds2_model = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes) - ds2_model._build(speech_featurizer.shape) - ds2_model.summary(line_length=120) -# Compile -ctc_trainer.compile(ds2_model, config.learning_config.optimizer_config, - max_to_keep=args.max_ckpts) - -ctc_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs) diff --git a/examples/deepspeech2/train_ga_ds2.py b/examples/deepspeech2/train_ga_ds2.py deleted file mode 100644 index 5996859552..0000000000 --- a/examples/deepspeech2/train_ga_ds2.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Deep Speech 2 Training") - -parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas") - -parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.ctc_runners import CTCTrainerGA -from tensorflow_asr.models.deepspeech2 import DeepSpeech2 - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, 
text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -ctc_trainer = CTCTrainerGA(text_featurizer, config.learning_config.running_config) -# Build DS2 model -with ctc_trainer.strategy.scope(): - ds2_model = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes) - ds2_model._build(speech_featurizer.shape) - ds2_model.summary(line_length=120) -# Compile -ctc_trainer.compile(ds2_model, config.learning_config.optimizer_config, - max_to_keep=args.max_ckpts) - -ctc_trainer.fit(train_dataset, eval_dataset, - train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs) diff --git a/examples/jasper/test_jasper.py b/examples/jasper/test.py similarity index 100% rename from examples/jasper/test_jasper.py rename to examples/jasper/test.py diff --git a/examples/jasper/train_keras_jasper.py b/examples/jasper/train.py similarity index 100% rename from examples/jasper/train_keras_jasper.py rename to examples/jasper/train.py diff --git a/examples/jasper/train_ga_jasper.py b/examples/jasper/train_ga_jasper.py deleted file mode 100644 index 4697b97e7b..0000000000 --- a/examples/jasper/train_ga_jasper.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Jasper Training") - -parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas") - -parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from 
tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.ctc_runners import CTCTrainerGA -from tensorflow_asr.models.jasper import Jasper - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -ctc_trainer = CTCTrainerGA(text_featurizer, config.learning_config.running_config) -# Build DS2 model -with ctc_trainer.strategy.scope(): - jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes) - jasper._build(speech_featurizer.shape) - jasper.summary(line_length=120) -# Compile -ctc_trainer.compile(jasper, config.learning_config.optimizer_config, - max_to_keep=args.max_ckpts) - -ctc_trainer.fit(train_dataset, eval_dataset, - train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs) diff --git a/examples/jasper/train_jasper.py b/examples/jasper/train_jasper.py deleted file mode 100644 index 528d1eaaa4..0000000000 --- a/examples/jasper/train_jasper.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Jasper Training") - -parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from 
tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.ctc_runners import CTCTrainer -from tensorflow_asr.models.jasper import Jasper - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -ctc_trainer = CTCTrainer(text_featurizer, config.learning_config.running_config) -# Build DS2 model -with ctc_trainer.strategy.scope(): - jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes) - jasper._build(speech_featurizer.shape) - jasper.summary(line_length=120) -# Compile -ctc_trainer.compile(jasper, config.learning_config.optimizer_config, - max_to_keep=args.max_ckpts) - -ctc_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs) diff --git a/examples/rnn_transducer/test_subword_rnn_transducer.py b/examples/rnn_transducer/test.py similarity index 100% rename from examples/rnn_transducer/test_subword_rnn_transducer.py rename to examples/rnn_transducer/test.py diff --git a/examples/rnn_transducer/test_rnn_transducer.py b/examples/rnn_transducer/test_rnn_transducer.py deleted file mode 100644 index b4ed2f9eee..0000000000 --- a/examples/rnn_transducer/test_rnn_transducer.py +++ /dev/null @@ -1,88 +0,0 @@ -# 
Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_devices - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Testing") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--saved", type=str, default=None, help="Path to saved model") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--device", type=int, default=0, help="Device's id to run test on") - -parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu") - -parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -setup_devices([args.device], cpu=args.cpu) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import 
TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.base_runners import BaseTester -from tensorflow_asr.models.streaming_transducer import StreamingTransducer - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -tf.random.set_seed(0) -assert args.saved - -if args.tfrecords: - test_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.test_dataset_config) - ) -else: - test_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.test_dataset_config) - ) - -# build model -streaming_transducer = StreamingTransducer( - vocabulary_size=text_featurizer.num_classes, - **config.model_config -) -streaming_transducer._build(speech_featurizer.shape) -streaming_transducer.load_weights(args.saved) -streaming_transducer.summary(line_length=150) -streaming_transducer.add_featurizers(speech_featurizer, text_featurizer) - -streaming_transducer_tester = BaseTester( - config=config.learning_config.running_config, - output_name=args.output_name -) -streaming_transducer_tester.compile(streaming_transducer) -streaming_transducer_tester.run(test_dataset) diff --git a/examples/rnn_transducer/tflite_subword_rnn_transducer.py b/examples/rnn_transducer/tflite.py similarity index 100% rename from examples/rnn_transducer/tflite_subword_rnn_transducer.py rename to examples/rnn_transducer/tflite.py diff --git a/examples/rnn_transducer/tflite_rnn_transducer.py b/examples/rnn_transducer/tflite_rnn_transducer.py deleted file mode 100644 index 6d4627010c..0000000000 --- a/examples/rnn_transducer/tflite_rnn_transducer.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use 
this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from tensorflow_asr.utils import setup_environment - -setup_environment() -import tensorflow as tf - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.models.streaming_transducer import StreamingTransducer - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Testing") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") - -parser.add_argument("--saved", type=str, default=None, - help="Path to saved model") - -parser.add_argument("output", type=str, default=None, - help="TFLite file path to be exported") - -args = parser.parse_args() - -assert args.saved and args.output - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -# build model -streaming_transducer = StreamingTransducer( - **config.model_config, - vocabulary_size=text_featurizer.num_classes -) -streaming_transducer._build(speech_featurizer.shape) -streaming_transducer.load_weights(args.saved) -streaming_transducer.summary(line_length=150) -streaming_transducer.add_featurizers(speech_featurizer, text_featurizer) - -concrete_func = 
streaming_transducer.make_tflite_function().get_concrete_function() -converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) -converter.optimizations = [tf.lite.Optimize.DEFAULT] -converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, - tf.lite.OpsSet.SELECT_TF_OPS] -tflite_model = converter.convert() - -if not os.path.exists(os.path.dirname(args.output)): - os.makedirs(os.path.dirname(args.output)) -with open(args.output, "wb") as tflite_out: - tflite_out.write(tflite_model) diff --git a/examples/rnn_transducer/train_keras_subword_rnn_transducer.py b/examples/rnn_transducer/train.py similarity index 88% rename from examples/rnn_transducer/train_keras_subword_rnn_transducer.py rename to examples/rnn_transducer/train.py index c9254a4fd0..6f7c92c643 100644 --- a/examples/rnn_transducer/train_keras_subword_rnn_transducer.py +++ b/examples/rnn_transducer/train.py @@ -43,9 +43,7 @@ parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") +parser.add_argument("--subword", default=False, action="store_true", help="Use subword") args = parser.parse_args() @@ -56,22 +54,18 @@ from tensorflow_asr.configs.config import Config from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer from tensorflow_asr.models.keras.streaming_transducer import StreamingTransducer config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) -if args.subwords and 
os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) +if args.subword: + print("Use subwords ...") + text_featurizer = SubwordFeaturizer(config.decoder_config) else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) + print("Use characters ...") + text_featurizer = CharFeaturizer(config.decoder_config) if args.tfrecords: train_dataset = ASRTFRecordDatasetKeras( diff --git a/examples/rnn_transducer/train_ga_rnn_transducer.py b/examples/rnn_transducer/train_ga_rnn_transducer.py deleted file mode 100644 index 516d9d90e9..0000000000 --- a/examples/rnn_transducer/train_ga_rnn_transducer.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA -from tensorflow_asr.models.streaming_transducer import StreamingTransducer - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - 
speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -streaming_transducer_trainer = TransducerTrainerGA( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with streaming_transducer_trainer.strategy.scope(): - # build model - streaming_transducer = StreamingTransducer( - **config.model_config, - vocabulary_size=text_featurizer.num_classes - ) - streaming_transducer._build(speech_featurizer.shape) - streaming_transducer.summary(line_length=150) - - optimizer = tf.keras.optimizers.get(config.learning_config.optimizer_config) - -streaming_transducer_trainer.compile(model=streaming_transducer, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -streaming_transducer_trainer.fit(train_dataset, eval_dataset, - train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs) diff --git a/examples/rnn_transducer/train_ga_subword_rnn_transducer.py b/examples/rnn_transducer/train_ga_subword_rnn_transducer.py deleted file mode 100644 index 96b81f4ea1..0000000000 --- a/examples/rnn_transducer/train_ga_subword_rnn_transducer.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset") - -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") - -args = parser.parse_args() - 
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA -from tensorflow_asr.models.streaming_transducer import StreamingTransducer - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) - -if args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) -else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -streaming_transducer_trainer = TransducerTrainerGA( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with streaming_transducer_trainer.strategy.scope(): - # build model - streaming_transducer = 
StreamingTransducer( - **config.model_config, - vocabulary_size=text_featurizer.num_classes - ) - streaming_transducer._build(speech_featurizer.shape) - streaming_transducer.summary(line_length=150) - - optimizer = tf.keras.optimizers.get(config.learning_config.optimizer_config) - -streaming_transducer_trainer.compile(model=streaming_transducer, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -streaming_transducer_trainer.fit(train_dataset, eval_dataset, - train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs) diff --git a/examples/rnn_transducer/train_rnn_transducer.py b/examples/rnn_transducer/train_rnn_transducer.py deleted file mode 100644 index 978c613836..0000000000 --- a/examples/rnn_transducer/train_rnn_transducer.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainer -from tensorflow_asr.models.streaming_transducer import StreamingTransducer - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - 
**vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -streaming_transducer_trainer = TransducerTrainer( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with streaming_transducer_trainer.strategy.scope(): - # build model - streaming_transducer = StreamingTransducer( - **config.model_config, - vocabulary_size=text_featurizer.num_classes - ) - streaming_transducer._build(speech_featurizer.shape) - streaming_transducer.summary(line_length=150) - - optimizer = tf.keras.optimizers.get(config.learning_config.optimizer_config) - -streaming_transducer_trainer.compile(model=streaming_transducer, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -streaming_transducer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs) diff --git a/examples/rnn_transducer/train_subword_rnn_transducer.py b/examples/rnn_transducer/train_subword_rnn_transducer.py deleted file mode 100644 index 14c937349b..0000000000 --- a/examples/rnn_transducer/train_subword_rnn_transducer.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") - -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") - -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") - -parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_strategy(args.devices) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.asr_dataset import 
ASRTFRecordDataset, ASRSliceDataset -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer -from tensorflow_asr.runners.transducer_runners import TransducerTrainer -from tensorflow_asr.models.streaming_transducer import StreamingTransducer - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) - -if args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) -else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) - -if args.tfrecords: - train_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) -else: - train_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config) - ) - eval_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) - ) - -streaming_transducer_trainer = TransducerTrainer( - config=config.learning_config.running_config, - text_featurizer=text_featurizer, strategy=strategy -) - -with streaming_transducer_trainer.strategy.scope(): - # build model - streaming_transducer = StreamingTransducer( - **config.model_config, - vocabulary_size=text_featurizer.num_classes - ) - streaming_transducer._build(speech_featurizer.shape) - streaming_transducer.summary(line_length=150) - - optimizer = 
tf.keras.optimizers.get(config.learning_config.optimizer_config) - -streaming_transducer_trainer.compile(model=streaming_transducer, optimizer=optimizer, - max_to_keep=args.max_ckpts) - -streaming_transducer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs) From d86d621b70a1b6eb2576ca88349105ece6cfcbf5 Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Thu, 15 Apr 2021 00:30:07 +0700 Subject: [PATCH 07/13] :writing_hand: update conformer training script --- examples/conformer/config.yml | 32 ++-- examples/conformer/tflite.py | 30 ++-- examples/conformer/train.py | 88 ++++++----- examples/conformer/train_tpu.py | 147 ------------------ scripts/create_librispeech_trans.py | 4 +- scripts/create_tfrecords.py | 4 +- scripts/generate_metadata.py | 2 +- tensorflow_asr/configs/__init__.py | 33 ---- tensorflow_asr/configs/config.py | 13 +- tensorflow_asr/datasets/__init__.py | 17 -- tensorflow_asr/datasets/asr_dataset.py | 57 +++---- tensorflow_asr/datasets/base_dataset.py | 2 +- .../featurizers/methods/gammatone.py | 2 +- .../models/layers/positional_encoding.py | 2 +- tensorflow_asr/models/layers/subsampling.py | 8 +- tensorflow_asr/utils/file_util.py | 59 ++++--- tensorflow_asr/utils/math_util.py | 8 +- 17 files changed, 161 insertions(+), 347 deletions(-) delete mode 100644 examples/conformer/train_tpu.py diff --git a/examples/conformer/config.yml b/examples/conformer/config.yml index 79bef5276b..0ee6487e98 100755 --- a/examples/conformer/config.yml +++ b/examples/conformer/config.yml @@ -24,14 +24,14 @@ speech_config: normalize_per_feature: False decoder_config: - vocabulary: null + vocabulary: ./vocabularies/librispeech/librispeech_train_10_1008.subwords target_vocab_size: 1000 max_subword_length: 10 blank_at_zero: True - beam_width: 5 + beam_width: 0 norm_score: True corpus_files: - - /media/nlhuy/Data/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv + - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv 
model_config: name: conformer @@ -40,7 +40,7 @@ model_config: filters: 144 kernel_size: 3 strides: 2 - encoder_positional_encoding: sinusoid_concat_v2 + encoder_positional_encoding: sinusoid_concat encoder_dmodel: 144 encoder_num_blocks: 16 encoder_head_size: 36 @@ -75,11 +75,10 @@ learning_config: num_masks: 1 mask_factor: 27 data_paths: - - /mnt/Data/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv + tfrecords_dir: null shuffle: True cache: True - cache_percent: 0.2 buffer_size: 100 drop_remainder: True stage: train @@ -87,7 +86,7 @@ learning_config: eval_dataset_config: use_tf: True data_paths: null - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + tfrecords_dir: null shuffle: False cache: True buffer_size: 100 @@ -97,7 +96,7 @@ learning_config: test_dataset_config: use_tf: True data_paths: null - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + tfrecords_dir: null shuffle: False cache: True buffer_size: 100 @@ -106,26 +105,21 @@ learning_config: optimizer_config: warmup_steps: 40000 - beta1: 0.9 - beta2: 0.98 + beta_1: 0.9 + beta_2: 0.98 epsilon: 1e-9 running_config: batch_size: 2 - accumulation_steps: 4 num_epochs: 50 - outdir: /mnt/Miscellanea/Models/local/conformer - log_interval_steps: 300 - eval_interval_steps: 500 - save_interval_steps: 1000 checkpoint: - filepath: /mnt/Miscellanea/Models/local/conformer/checkpoints/{epoch:02d}.h5 + filepath: /mnt/e/Models/local/conformer/checkpoints/{epoch:02d}.h5 save_best_only: True save_weights_only: False save_freq: epoch - states_dir: /mnt/Miscellanea/Models/local/conformer/states + states_dir: /mnt/e/Models/local/conformer/states tensorboard: - log_dir: /mnt/Miscellanea/Models/local/conformer/tensorboard + log_dir: /mnt/e/Models/local/conformer/tensorboard histogram_freq: 1 write_graph: True write_images: True 
diff --git a/examples/conformer/tflite.py b/examples/conformer/tflite.py index 29794d957e..3159f656ba 100644 --- a/examples/conformer/tflite.py +++ b/examples/conformer/tflite.py @@ -14,14 +14,14 @@ import os import argparse -from tensorflow_asr.utils import setup_environment +from tensorflow_asr.utils import env_util, file_util -setup_environment() +env_util.setup_environment() import tensorflow as tf from tensorflow_asr.configs.config import Config from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer from tensorflow_asr.models.conformer import Conformer DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") @@ -30,17 +30,13 @@ parser = argparse.ArgumentParser(prog="Conformer Testing") -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--saved", type=str, default=None, - help="Path to saved model") +parser.add_argument("--saved", type=str, default=None, help="Path to saved model") -parser.add_argument("--subwords", type=str, default=None, - help="Path to file that stores generated subwords") +parser.add_argument("--subwords", type=str, default=None, help="Use subwords") -parser.add_argument("output", type=str, default=None, - help="TFLite file path to be exported") +parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported") args = parser.parse_args() @@ -49,17 +45,16 @@ config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) -if args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = 
SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) +if args.subwords: + text_featurizer = SubwordFeaturizer(config.decoder_config) else: - raise ValueError("subwords must be set") + text_featurizer = CharFeaturizer(config.decoder_config) # build model conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) conformer._build(speech_featurizer.shape) conformer.load_weights(args.saved) -conformer.summary(line_length=150) +conformer.summary(line_length=100) conformer.add_featurizers(speech_featurizer, text_featurizer) concrete_func = conformer.make_tflite_function().get_concrete_function() @@ -69,7 +64,6 @@ converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] tflite_model = converter.convert() -if not os.path.exists(os.path.dirname(args.output)): - os.makedirs(os.path.dirname(args.output)) +args.output = file_util.preprocess_paths(args.output) with open(args.output, "wb") as tflite_out: tflite_out.write(tflite_model) diff --git a/examples/conformer/train.py b/examples/conformer/train.py index 0c844062a1..3b10b3c86e 100644 --- a/examples/conformer/train.py +++ b/examples/conformer/train.py @@ -15,9 +15,9 @@ import os import math import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy +from tensorflow_asr.utils import env_util -setup_environment() +env_util.setup_environment() import tensorflow as tf DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") @@ -28,81 +28,86 @@ parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") 
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") + parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") parser.add_argument("--spx", type=int, default=1, help="Steps per execution for maximizing performance") -parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata") +parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata") + +parser.add_argument("--static_length", default=False, action="store_true", help="Use static lengths") parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") -parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") - args = parser.parse_args() tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) -strategy = setup_strategy(args.devices) +strategy = env_util.setup_strategy(args.devices) from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer -from tensorflow_asr.models.keras.conformer import Conformer +from tensorflow_asr.datasets import asr_dataset +from tensorflow_asr.featurizers import speech_featurizers, text_featurizers +from tensorflow_asr.models.transducer.conformer import Conformer from tensorflow_asr.optimizers.schedules import TransformerSchedule config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) +speech_featurizer = 
speech_featurizers.TFSpeechFeaturizer(config.speech_config) if args.sentence_piece: print("Loading SentencePiece model ...") - text_featurizer = SentencePieceFeaturizer(config.decoder_config) + text_featurizer = text_featurizers.SentencePieceFeaturizer(config.decoder_config) elif args.subwords: print("Loading subwords ...") - text_featurizer = SubwordFeaturizer(config.decoder_config) + text_featurizer = text_featurizers.SubwordFeaturizer(config.decoder_config) else: print("Use characters ...") - text_featurizer = CharFeaturizer(config.decoder_config) + text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config) if args.tfrecords: - train_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + train_dataset = asr_dataset.ASRTFRecordDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.train_dataset_config), indefinite=True ) - eval_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) + eval_dataset = asr_dataset.ASRTFRecordDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, + **vars(config.learning_config.eval_dataset_config), + indefinite=True ) - # Update metadata calculated from both train and eval datasets - train_dataset.load_metadata(args.metadata_prefix) - eval_dataset.load_metadata(args.metadata_prefix) - # Use dynamic length - speech_featurizer.reset_length() - text_featurizer.reset_length() else: - train_dataset = ASRSliceDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + train_dataset = asr_dataset.ASRSliceDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.train_dataset_config), indefinite=True ) - eval_dataset = ASRSliceDatasetKeras( - speech_featurizer=speech_featurizer, 
text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config), + eval_dataset = asr_dataset.ASRSliceDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, + **vars(config.learning_config.eval_dataset_config), indefinite=True ) -global_batch_size = config.learning_config.running_config.batch_size +train_dataset.load_metadata(args.metadata) +eval_dataset.load_metadata(args.metadata) + +if not args.static_length: + speech_featurizer.reset_length() + text_featurizer.reset_length() + +global_batch_size = args.tbs or config.learning_config.running_config.batch_size global_batch_size *= strategy.num_replicas_in_sync train_data_loader = train_dataset.create(global_batch_size) @@ -112,17 +117,15 @@ # build model conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) conformer._build(speech_featurizer.shape) - conformer.summary(line_length=120) + conformer.summary(line_length=100) optimizer = tf.keras.optimizers.Adam( TransformerSchedule( d_model=conformer.dmodel, - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], + warmup_steps=config.learning_config.optimizer_config.pop("warmup_steps", 10000), max_lr=(0.05 / math.sqrt(conformer.dmodel)) ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] + **config.learning_config.optimizer_config ) conformer.compile( @@ -139,7 +142,10 @@ ] conformer.fit( - train_data_loader, epochs=config.learning_config.running_config.num_epochs, - validation_data=eval_data_loader, callbacks=callbacks, - steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps + train_data_loader, + epochs=config.learning_config.running_config.num_epochs, + validation_data=eval_data_loader, + callbacks=callbacks, + steps_per_epoch=train_dataset.total_steps, + validation_steps=eval_dataset.total_steps ) diff 
--git a/examples/conformer/train_tpu.py b/examples/conformer/train_tpu.py deleted file mode 100644 index 8a0937c985..0000000000 --- a/examples/conformer/train_tpu.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_tpu - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") - -parser.add_argument("--bs", type=int, default=None, help="Batch size per replica") - -parser.add_argument("--spx", type=int, default=50, help="Steps per execution for maximizing TPU performance") - -parser.add_argument("--tpu_address", type=str, default=None, help="TPU address. 
Leave None on Colab") - -parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata") - -parser.add_argument("--compute_lengths", default=False, action="store_true", help="Whether to compute lengths") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") - -parser.add_argument("--saved", type=str, default=None, help="Path to saved model") - -parser.add_argument("--validation", default=False, action="store_true", help="Enable validation dataset") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_tpu(args.tpu_address) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer -from tensorflow_asr.models.keras.conformer import Conformer -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) - -if args.sentence_piece: - print("Loading SentencePiece model ...") - text_featurizer = SentencePieceFeaturizer(config.decoder_config) -elif args.subwords: - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer(config.decoder_config) -else: - print("Use characters...") - text_featurizer = CharFeaturizer(config.decoder_config) - -train_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config), - indefinite=True -) - -if args.validation: - eval_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, 
text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config), - indefinite=True - ) - -if args.compute_lengths: - train_dataset.update_lengths(args.metadata_prefix) - if args.validation: - eval_dataset.update_lengths(args.metadata_prefix) - -# Update metadata calculated from both train and eval datasets -train_dataset.load_metadata(args.metadata_prefix) -if args.validation: - eval_dataset.load_metadata(args.metadata_prefix) - -batch_size = args.bs if args.bs is not None else config.learning_config.running_config.batch_size -global_batch_size = batch_size -global_batch_size *= strategy.num_replicas_in_sync - -train_data_loader = train_dataset.create(global_batch_size) -eval_data_loader = eval_dataset.create(global_batch_size) if args.validation else None -validation_steps = eval_dataset.total_steps if args.validation else None - -with strategy.scope(): - # build model - conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) - conformer._build(speech_featurizer.shape, prediction_shape=text_featurizer.prepand_shape, batch_size=global_batch_size) - - if args.saved: - conformer.load_weights(args.saved, by_name=True, skip_mismatch=True) - print('Load pretrained weights successfully') - - conformer.summary(line_length=120) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=conformer.dmodel, - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(conformer.dmodel)) - ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] - ) - - conformer.compile( - optimizer=optimizer, - experimental_steps_per_execution=args.spx, - global_batch_size=global_batch_size, - blank=text_featurizer.blank - ) - -callbacks = [ - tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint), - 
tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir), - tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard) -] - -conformer.fit( - train_data_loader, epochs=config.learning_config.running_config.num_epochs, - validation_data=eval_data_loader, callbacks=callbacks, - steps_per_epoch=train_dataset.total_steps, validation_steps=validation_steps -) diff --git a/scripts/create_librispeech_trans.py b/scripts/create_librispeech_trans.py index 9a84cb4039..3ad3e7ac5e 100644 --- a/scripts/create_librispeech_trans.py +++ b/scripts/create_librispeech_trans.py @@ -19,7 +19,7 @@ from tqdm.auto import tqdm import unicodedata -from tensorflow_asr.utils.utils import preprocess_paths +from tensorflow_asr.utils.file_util import preprocess_paths parser = argparse.ArgumentParser(prog="Setup LibriSpeech Transcripts") @@ -31,7 +31,7 @@ assert args.dir and args.output -args.dir = preprocess_paths(args.dir) +args.dir = preprocess_paths(args.dir, isdir=True) args.output = preprocess_paths(args.output) transcripts = [] diff --git a/scripts/create_tfrecords.py b/scripts/create_tfrecords.py index 8fe48dcd0e..32a3d520bd 100644 --- a/scripts/create_tfrecords.py +++ b/scripts/create_tfrecords.py @@ -15,7 +15,7 @@ import os import argparse from tensorflow_asr.configs.config import Config -from tensorflow_asr.utils.utils import preprocess_paths +from tensorflow_asr.utils.file_util import preprocess_paths from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer @@ -40,7 +40,7 @@ args = parser.parse_args() transcripts = preprocess_paths(args.transcripts) -tfrecords_dir = preprocess_paths(args.tfrecords_dir) +tfrecords_dir = preprocess_paths(args.tfrecords_dir, isdir=True) config = Config(args.config) diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index 395e41effb..48b0315943 100644 --- 
a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -15,7 +15,7 @@ import os import argparse from tensorflow_asr.configs.config import Config -from tensorflow_asr.utils.utils import preprocess_paths +from tensorflow_asr.utils.file_util import preprocess_paths from tensorflow_asr.datasets.asr_dataset import ASRDataset from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer diff --git a/tensorflow_asr/configs/__init__.py b/tensorflow_asr/configs/__init__.py index f4d5510355..e69de29bb2 100644 --- a/tensorflow_asr/configs/__init__.py +++ b/tensorflow_asr/configs/__init__.py @@ -1,33 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re -import yaml - - -def load_yaml(path): - # Fix yaml numbers https://stackoverflow.com/a/30462009/11037553 - loader = yaml.SafeLoader - loader.add_implicit_resolver( - u'tag:yaml.org,2002:float', - re.compile(u'''^(?: - [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)? - |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+) - |\\.[0-9_]+(?:[eE][-+][0-9]+)? 
- |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]* - |[-+]?\\.(?:inf|Inf|INF) - |\\.(?:nan|NaN|NAN))$''', re.X), - list(u'-+0123456789.')) - with open(path, "r", encoding="utf-8") as file: - return yaml.load(file, Loader=loader) diff --git a/tensorflow_asr/configs/config.py b/tensorflow_asr/configs/config.py index da79ddd1f0..028016e853 100644 --- a/tensorflow_asr/configs/config.py +++ b/tensorflow_asr/configs/config.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import load_yaml -from ..augmentations.augments import Augmentation +from ..augmentations.augmentation import Augmentation from ..utils import file_util @@ -42,14 +41,14 @@ def __init__(self, config: dict = None): if not config: config = {} self.stage = config.pop("stage", None) self.data_paths = file_util.preprocess_paths(config.pop("data_paths", None)) - self.tfrecords_dir = file_util.preprocess_paths(config.pop("tfrecords_dir", None)) + self.tfrecords_dir = file_util.preprocess_paths(config.pop("tfrecords_dir", None), isdir=True) self.tfrecords_shards = config.pop("tfrecords_shards", 16) self.shuffle = config.pop("shuffle", False) self.cache = config.pop("cache", False) self.drop_remainder = config.pop("drop_remainder", True) self.buffer_size = config.pop("buffer_size", 100) self.use_tf = config.pop("use_tf", False) - self.augmentations = Augmentation(config.pop("augmentation_config", {}), use_tf=self.use_tf) + self.augmentations = Augmentation(config.pop("augmentation_config", {})) for k, v in config.items(): setattr(self, k, v) @@ -59,10 +58,6 @@ def __init__(self, config: dict = None): self.batch_size = config.pop("batch_size", 1) self.accumulation_steps = config.pop("accumulation_steps", 1) self.num_epochs = config.pop("num_epochs", 20) - self.outdir = file_util.preprocess_paths(config.pop("outdir", None)) - self.log_interval_steps = config.pop("log_interval_steps", 500) - self.save_interval_steps = 
config.pop("save_interval_steps", 500) - self.eval_interval_steps = config.pop("eval_interval_steps", 1000) for k, v in config.items(): setattr(self, k, v) @@ -81,7 +76,7 @@ class Config: """ User config class for training, testing or infering """ def __init__(self, path: str): - config = load_yaml(file_util.preprocess_paths(path)) + config = file_util.load_yaml(file_util.preprocess_paths(path)) self.speech_config = config.pop("speech_config", {}) self.decoder_config = config.pop("decoder_config", {}) self.model_config = config.pop("model_config", {}) diff --git a/tensorflow_asr/datasets/__init__.py b/tensorflow_asr/datasets/__init__.py index f5f8a8a1e8..e69de29bb2 100644 --- a/tensorflow_asr/datasets/__init__.py +++ b/tensorflow_asr/datasets/__init__.py @@ -1,17 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from .base_dataset import BaseDataset -from .asr_dataset import ASRDataset, ASRTFRecordDataset, ASRSliceDataset -__all__ = ['BaseDataset', 'ASRDataset', 'ASRTFRecordDataset', 'ASRSliceDataset'] diff --git a/tensorflow_asr/datasets/asr_dataset.py b/tensorflow_asr/datasets/asr_dataset.py index f2d08de6ae..1b6fdca3b6 100755 --- a/tensorflow_asr/datasets/asr_dataset.py +++ b/tensorflow_asr/datasets/asr_dataset.py @@ -60,12 +60,15 @@ def compute_metadata(self): self.speech_featurizer.update_length(input_length) self.text_featurizer.update_length(label_length) - def save_metadata(self, metadata_prefix: str = None): - if metadata_prefix is None: return - metadata_path = file_util.preprocess_paths(metadata_prefix) + ".metadata.json" - if tf.io.gfile.exists(metadata_path): - with tf.io.gfile.GFile(metadata_path, "r") as f: - content = json.loads(f.read()) + def save_metadata(self, metadata: str = None): + if metadata is None: return + metadata = file_util.preprocess_paths(metadata) + if tf.io.gfile.exists(metadata): + with tf.io.gfile.GFile(metadata, "r") as f: + try: + content = json.loads(f.read()) + except json.JSONDecodeError: + raise ValueError(f'File {metadata} is currently not in json format. 
Please update the file') else: content = {} content[self.stage] = { @@ -73,17 +76,20 @@ def save_metadata(self, metadata_prefix: str = None): "max_label_length": self.text_featurizer.max_length, "num_entries": self.total_steps } - with tf.io.gfile.GFile(metadata_path, "w") as f: + with tf.io.gfile.GFile(metadata, "w") as f: f.write(json.dumps(content, indent=2)) - print(f"metadata written to {metadata_path}") - - def load_metadata(self, metadata_prefix: str = None): - if metadata_prefix is None: return - metadata_path = file_util.preprocess_paths(metadata_prefix) + ".metadata.json" - if tf.io.gfile.exists(metadata_path): - print(f"Loading metadata from {metadata_path} ...") - with tf.io.gfile.GFile(metadata_path, "r") as f: - content = json.loads(f.read()).get(self.stage, {}) + print(f"Metadata written to {metadata}") + + def load_metadata(self, metadata: str = None): + if metadata is None: return + metadata = file_util.preprocess_paths(metadata) + if tf.io.gfile.exists(metadata): + print(f"Loading metadata from {metadata} ...") + with tf.io.gfile.GFile(metadata, "r") as f: + try: + content = json.loads(f.read()).get(self.stage, {}) + except json.JSONDecodeError: + raise ValueError(f'File {metadata} must be in json format') self.speech_featurizer.update_length(int(content.get("max_input_length", 0))) self.text_featurizer.update_length(int(content.get("max_label_length", 0))) self.total_steps = int(content.get("num_entries", 0)) @@ -123,19 +129,17 @@ def preprocess(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor): with tf.device("/CPU:0"): def fn(_path: bytes, _audio: bytes, _indices: bytes): signal = read_raw_audio(_audio, sample_rate=self.speech_featurizer.sample_rate) - signal = self.augmentations.signal_augment(signal) - features = self.speech_featurizer.extract(signal.numpy()) - features = self.augmentations.feature_augment(features) + features = tf.convert_to_tensor(features, tf.float32) + input_length = tf.cast(tf.shape(features)[0], tf.int32) 
label = tf.strings.to_number(tf.strings.split(_indices), out_type=tf.int32) label_length = tf.cast(tf.shape(label)[0], tf.int32) + prediction = self.text_featurizer.prepand_blank(label) prediction_length = tf.cast(tf.shape(prediction)[0], tf.int32) - features = tf.convert_to_tensor(features, tf.float32) - input_length = tf.cast(tf.shape(features)[0], tf.int32) return _path, features, input_length, label, label_length, prediction, prediction_length @@ -147,19 +151,16 @@ def fn(_path: bytes, _audio: bytes, _indices: bytes): def tf_preprocess(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor): with tf.device("/CPU:0"): signal = tf_read_raw_audio(audio, self.speech_featurizer.sample_rate) - signal = self.augmentations.signal_augment(signal) - features = self.speech_featurizer.tf_extract(signal) - features = self.augmentations.feature_augment(features) + input_length = tf.cast(tf.shape(features)[0], tf.int32) label = tf.strings.to_number(tf.strings.split(indices), out_type=tf.int32) label_length = tf.cast(tf.shape(label)[0], tf.int32) + prediction = self.text_featurizer.prepand_blank(label) prediction_length = tf.cast(tf.shape(prediction)[0], tf.int32) - features = tf.convert_to_tensor(features, tf.float32) - input_length = tf.cast(tf.shape(features)[0], tf.int32) return path, features, input_length, label, label_length, prediction, prediction_length @@ -190,6 +191,7 @@ def parse(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor): def process(self, dataset, batch_size): dataset = dataset.map(self.parse, num_parallel_calls=AUTOTUNE) + self.total_steps = math_util.get_num_batches(self.total_steps, batch_size, drop_remainders=self.drop_remainder) if self.cache: dataset = dataset.cache() @@ -197,7 +199,7 @@ def process(self, dataset, batch_size): if self.shuffle: dataset = dataset.shuffle(self.buffer_size, reshuffle_each_iteration=True) - if self.indefinite: + if self.indefinite and self.total_steps: dataset = dataset.repeat() # PADDED BATCH the dataset 
@@ -232,7 +234,6 @@ def process(self, dataset, batch_size): # PREFETCH to improve speed of input length dataset = dataset.prefetch(AUTOTUNE) - self.total_steps = math_util.get_num_batches(self.total_steps, batch_size, drop_remainders=self.drop_remainder) return dataset def create(self, batch_size: int): diff --git a/tensorflow_asr/datasets/base_dataset.py b/tensorflow_asr/datasets/base_dataset.py index 9444d3b85e..3722752b79 100644 --- a/tensorflow_asr/datasets/base_dataset.py +++ b/tensorflow_asr/datasets/base_dataset.py @@ -15,7 +15,7 @@ import tensorflow as tf -from ..augmentations.augments import Augmentation +from ..augmentations.augmentation import Augmentation BUFFER_SIZE = 100 TFRECORD_SHARDS = 16 diff --git a/tensorflow_asr/featurizers/methods/gammatone.py b/tensorflow_asr/featurizers/methods/gammatone.py index dec76d8482..34443efcb7 100644 --- a/tensorflow_asr/featurizers/methods/gammatone.py +++ b/tensorflow_asr/featurizers/methods/gammatone.py @@ -16,7 +16,7 @@ import numpy as np import tensorflow as tf -from ..utils.utils import shape_list +from ...utils.shape_util import shape_list pi = tf.constant(np.pi, dtype=tf.complex64) diff --git a/tensorflow_asr/models/layers/positional_encoding.py b/tensorflow_asr/models/layers/positional_encoding.py index 832eb2c491..bf108aa263 100755 --- a/tensorflow_asr/models/layers/positional_encoding.py +++ b/tensorflow_asr/models/layers/positional_encoding.py @@ -13,7 +13,7 @@ # limitations under the License. 
import tensorflow as tf -from ...utils.utils import shape_list +from ...utils.shape_util import shape_list class PositionalEncoding(tf.keras.layers.Layer): diff --git a/tensorflow_asr/models/layers/subsampling.py b/tensorflow_asr/models/layers/subsampling.py index 8a84f35205..3e69f4dcdf 100644 --- a/tensorflow_asr/models/layers/subsampling.py +++ b/tensorflow_asr/models/layers/subsampling.py @@ -14,7 +14,7 @@ import tensorflow as tf -from ...utils.utils import merge_two_last_dims, shape_list +from ...utils import shape_util, math_util class TimeReduction(tf.keras.layers.Layer): @@ -27,7 +27,7 @@ def padding(self, time): return tf.cast(new_time, dtype=tf.int32) - time def call(self, inputs, **kwargs): - shape = shape_list(inputs) + shape = shape_util.shape_list(inputs) outputs = tf.pad(inputs, [[0, 0], [0, self.padding(shape[1])], [0, 0]]) outputs = tf.reshape(outputs, [shape[0], -1, shape[-1] * self.time_reduction_factor]) return outputs @@ -95,7 +95,7 @@ def call(self, inputs, training=False, **kwargs): outputs = tf.nn.relu(outputs) outputs = self.maxpool2(outputs, training=training) - return merge_two_last_dims(outputs) + return math_util.merge_two_last_dims(outputs) def get_config(self): conf = super(VggSubsampling, self).get_config() @@ -137,7 +137,7 @@ def call(self, inputs, training=False, **kwargs): outputs = tf.nn.relu(outputs) outputs = self.conv2(outputs, training=training) outputs = tf.nn.relu(outputs) - return merge_two_last_dims(outputs) + return math_util.merge_two_last_dims(outputs) def get_config(self): conf = super(Conv2dSubsampling, self).get_config() diff --git a/tensorflow_asr/utils/file_util.py b/tensorflow_asr/utils/file_util.py index 0d69315c87..c46363d1ac 100644 --- a/tensorflow_asr/utils/file_util.py +++ b/tensorflow_asr/utils/file_util.py @@ -14,16 +14,35 @@ import os import re +import yaml import tempfile +import contextlib from typing import Union, List import tensorflow as tf -def is_hdf5_filepath(filepath): +def load_yaml(path): + # 
Fix yaml numbers https://stackoverflow.com/a/30462009/11037553 + loader = yaml.SafeLoader + loader.add_implicit_resolver( + u'tag:yaml.org,2002:float', + re.compile(u'''^(?: + [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)? + |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+) + |\\.[0-9_]+(?:[eE][-+][0-9]+)? + |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]* + |[-+]?\\.(?:inf|Inf|INF) + |\\.(?:nan|NaN|NAN))$''', re.X), + list(u'-+0123456789.')) + with open(path, "r", encoding="utf-8") as file: + return yaml.load(file, Loader=loader) + + +def is_hdf5_filepath(filepath: str) -> bool: return (filepath.endswith('.h5') or filepath.endswith('.keras') or filepath.endswith('.hdf5')) -def is_cloud_path(path): +def is_cloud_path(path: str) -> bool: """ Check if the path is on cloud (which requires tf.io.gfile) Args: @@ -35,8 +54,8 @@ def is_cloud_path(path): return bool(re.match(r"^[a-z]+://", path)) -def preprocess_paths(paths: Union[List, str]): - """Expand the path to the root "/" +def preprocess_paths(paths: Union[List[str], str], isdir: bool = False) -> Union[List[str], str]: + """ Expand the path to the root "/" and makedirs Args: paths (Union[List, str]): A path or list of paths @@ -45,20 +64,21 @@ def preprocess_paths(paths: Union[List, str]): Union[List, str]: A processed path or list of paths, return None if it's not path """ if isinstance(paths, list): - return [path if is_cloud_path(path) else os.path.abspath(os.path.expanduser(path)) for path in paths] - elif isinstance(paths, str): - return paths if is_cloud_path(paths) else os.path.abspath(os.path.expanduser(paths)) - else: - return None - - -def read_bytes(path: str) -> tf.Tensor: - with tf.io.gfile.GFile(path, "rb") as f: - content = f.read() - return tf.convert_to_tensor(content, dtype=tf.string) - - -def save_file(filepath): + paths = [path if is_cloud_path(path) else os.path.abspath(os.path.expanduser(path)) for path in paths] + for path in paths: + dirpath = path if isdir else os.path.dirname(path) + if not 
tf.io.gfile.exists(dirpath): tf.io.gfile.makedirs(dirpath) + return paths + if isinstance(paths, str): + paths = paths if is_cloud_path(paths) else os.path.abspath(os.path.expanduser(paths)) + dirpath = paths if isdir else os.path.dirname(paths) + if not tf.io.gfile.exists(dirpath): tf.io.gfile.makedirs(dirpath) + return paths + return None + + +@contextlib.contextmanager +def save_file(filepath: str): if is_cloud_path(filepath) and is_hdf5_filepath(filepath): _, ext = os.path.splitext(filepath) with tempfile.NamedTemporaryFile(suffix=ext) as tmp: @@ -68,7 +88,8 @@ def save_file(filepath): yield filepath -def read_file(filepath): +@contextlib.contextmanager +def read_file(filepath: str): if is_cloud_path(filepath) and is_hdf5_filepath(filepath): _, ext = os.path.splitext(filepath) with tempfile.NamedTemporaryFile(suffix=ext) as tmp: diff --git a/tensorflow_asr/utils/math_util.py b/tensorflow_asr/utils/math_util.py index 451a9bcb03..5f613b6e8b 100644 --- a/tensorflow_asr/utils/math_util.py +++ b/tensorflow_asr/utils/math_util.py @@ -25,10 +25,10 @@ def log10(x): return numerator / denominator -def get_num_batches(samples, batch_size, drop_remainders=True): - if samples is None or batch_size is None: return None - if drop_remainders: return math.floor(float(samples) / float(batch_size)) - return math.ceil(float(samples) / float(batch_size)) +def get_num_batches(nsamples, batch_size, drop_remainders=True): + if nsamples is None or batch_size is None: return None + if drop_remainders: return math.floor(float(nsamples) / float(batch_size)) + return math.ceil(float(nsamples) / float(batch_size)) def nan_to_zero(input_tensor): From 4d07e9c48ea04dab0ed02229135da313cfecaf01 Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Fri, 16 Apr 2021 01:24:23 +0700 Subject: [PATCH 08/13] :writing_hand: update testing script --- examples/conformer/test.py | 62 +++++++++++-------- tensorflow_asr/losses/ctc_loss.py | 10 ++- tensorflow_asr/losses/rnnt_loss.py | 10 ++- 
tensorflow_asr/models/base_model.py | 7 ++- tensorflow_asr/models/ctc/ctc.py | 15 +++-- .../models/transducer/contextnet.py | 34 +++------- .../models/transducer/rnn_transducer.py | 30 ++++----- .../models/transducer/transducer.py | 33 ++++------ tensorflow_asr/utils/env_util.py | 3 +- 9 files changed, 91 insertions(+), 113 deletions(-) diff --git a/examples/conformer/test.py b/examples/conformer/test.py index da47ab3dc4..4c1f8e13d1 100644 --- a/examples/conformer/test.py +++ b/examples/conformer/test.py @@ -14,9 +14,9 @@ import os import argparse -from tensorflow_asr.utils import setup_environment, setup_devices +from tensorflow_asr.utils import env_util, file_util -setup_environment() +env_util.setup_environment() import tensorflow as tf DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") @@ -33,52 +33,57 @@ parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") +parser.add_argument("--bs", type=int, default=None, help="Test batch size") + parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") +parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") + parser.add_argument("--device", type=int, default=0, help="Device's id to run test on") parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu") -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix") +parser.add_argument("--output", type=str, default="test.tsv", help="Result filepath") args = parser.parse_args() +assert args.saved + tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) -setup_devices([args.device], cpu=args.cpu) +env_util.setup_devices([args.device], cpu=args.cpu) from tensorflow_asr.configs.config import 
Config from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer -from tensorflow_asr.runners.base_runners import BaseTester -from tensorflow_asr.models.conformer import Conformer +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer +from tensorflow_asr.models.transducer.conformer import Conformer config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) if args.sentence_piece: - print("Loading SentencePiece model ...") - text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords) -elif args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) + print("Use SentencePiece ...") + text_featurizer = SentencePieceFeaturizer(config.decoder_config) +elif args.subwords: + print("Use subwords ...") + text_featurizer = SubwordFeaturizer(config.decoder_config) else: - raise ValueError("subwords must be set") + print("Use characters ...") + text_featurizer = CharFeaturizer(config.decoder_config) tf.random.set_seed(0) -assert args.saved if args.tfrecords: test_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.test_dataset_config) ) else: test_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.test_dataset_config) ) @@ -86,12 +91,19 @@ conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes) 
conformer._build(speech_featurizer.shape) conformer.load_weights(args.saved) -conformer.summary(line_length=120) +conformer.summary(line_length=100) conformer.add_featurizers(speech_featurizer, text_featurizer) -conformer_tester = BaseTester( - config=config.learning_config.running_config, - output_name=args.output_name -) -conformer_tester.compile(conformer) -conformer_tester.run(test_dataset) +batch_size = args.bs or config.learning_config.running_config.batch_size +test_data_loader = test_dataset.create(batch_size) + +results = conformer.predict(test_data_loader) + +with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath: + print(f"Saving result to {args.output} ...") + with open(filepath, "w") as openfile: + openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n") + for i, entry in test_dataset.entries: + groundtruth, greedy, beamsearch = results[i] + path, duration, _ = entry + openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n") diff --git a/tensorflow_asr/losses/ctc_loss.py b/tensorflow_asr/losses/ctc_loss.py index 6808c57b15..89519a4e60 100644 --- a/tensorflow_asr/losses/ctc_loss.py +++ b/tensorflow_asr/losses/ctc_loss.py @@ -21,13 +21,11 @@ def __init__(self, blank=0, global_batch_size=None, name=None): self.global_batch_size = global_batch_size def call(self, y_true, y_pred): - logits, logits_length = y_pred.values() - labels, labels_length = y_true.values() loss = ctc_loss( - y_pred=logits, - input_length=logits_length, - y_true=labels, - label_length=labels_length, + y_pred=y_pred["logits"], + input_length=y_pred["logits_length"], + y_true=y_true["labels"], + label_length=y_true["labels_length"], blank=self.blank, name=self.name ) diff --git a/tensorflow_asr/losses/rnnt_loss.py b/tensorflow_asr/losses/rnnt_loss.py index 646ec4586f..85da24dd5f 100644 --- a/tensorflow_asr/losses/rnnt_loss.py +++ b/tensorflow_asr/losses/rnnt_loss.py @@ -37,13 +37,11 @@ def __init__(self, blank=0, 
global_batch_size=None, name=None): self.global_batch_size = global_batch_size def call(self, y_true, y_pred): - logits, logits_length = y_pred.values() - labels, labels_length = y_true.values() loss = rnnt_loss( - logits=logits, - logit_length=logits_length, - labels=labels, - label_length=labels_length, + logits=y_pred["logits"], + logit_length=y_pred["logits_length"], + labels=y_true["labels"], + label_length=y_true["labels_length"], blank=self.blank, name=self.name ) diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py index b8378410e2..f2eab7e0ec 100644 --- a/tensorflow_asr/models/base_model.py +++ b/tensorflow_asr/models/base_model.py @@ -111,9 +111,12 @@ def predict_step(self, batch): [tf.Tensor]: stacked tensor of shape [B, 3] with each row is the text [truth, greedy, beam_search] """ inputs, y_true = batch - labels = self.text_featurizer.iextract(y_true) + labels = self.text_featurizer.iextract(y_true["labels"]) greedy_decoding = self.recognize(inputs) - beam_search_decoding = self.recognize_beam(inputs) + if self.text_featurizer.decoder_config.beam_width == 0: + beam_search_decoding = tf.map_fn(lambda _: tf.convert_to_tensor("", dtype=tf.string), labels) + else: + beam_search_decoding = self.recognize_beam(inputs) return tf.stack([labels, greedy_decoding, beam_search_decoding], axis=-1) def recognize(self, features, input_lengths, **kwargs): diff --git a/tensorflow_asr/models/ctc/ctc.py b/tensorflow_asr/models/ctc/ctc.py index ab0b60da16..a30c0e166e 100644 --- a/tensorflow_asr/models/ctc/ctc.py +++ b/tensorflow_asr/models/ctc/ctc.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Union +from typing import Dict, Union import numpy as np import tensorflow as tf @@ -69,19 +69,18 @@ def add_featurizers(self, self.text_featurizer = text_featurizer def call(self, inputs, training=False, **kwargs): - inputs, inputs_length, _, _ = inputs.values() - logits = self.encoder(inputs, training=training, **kwargs) + logits = self.encoder(inputs["inputs"], training=training, **kwargs) logits = self.decoder(logits, training=training, **kwargs) return data_util.create_logits( logits=logits, - logits_length=math_util.get_reduced_length(inputs_length, self.time_reduction_factor) + logits_length=math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor) ) # -------------------------------- GREEDY ------------------------------------- @tf.function - def recognize(self, features: tf.Tensor, input_length: Optional[tf.Tensor]): - logits = self(features, training=False) + def recognize(self, inputs: Dict[str, tf.Tensor]): + logits = self(inputs["inputs"], training=False) probs = tf.nn.softmax(logits) def map_fn(prob): return tf.numpy_function(self._perform_greedy, inp=[prob], Tout=tf.string) @@ -119,8 +118,8 @@ def recognize_tflite(self, signal): # -------------------------------- BEAM SEARCH ------------------------------------- @tf.function - def recognize_beam(self, features: tf.Tensor, input_length: Optional[tf.Tensor], lm: bool = False): - logits = self(features, training=False) + def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False): + logits = self(inputs["inputs"], training=False) probs = tf.nn.softmax(logits) def map_fn(prob): return tf.numpy_function(self._perform_beam_search, inp=[prob, lm], Tout=tf.string) diff --git a/tensorflow_asr/models/transducer/contextnet.py b/tensorflow_asr/models/transducer/contextnet.py index 2f47f100ee..bc81f17fe7 100644 --- a/tensorflow_asr/models/transducer/contextnet.py +++ b/tensorflow_asr/models/transducer/contextnet.py @@ -12,11 +12,12 @@ # See the 
License for the specific language governing permissions and # limitations under the License. -from typing import List +from typing import Dict, List import tensorflow as tf from ..encoders.contextnet import ContextNetEncoder, L2 from .transducer import Transducer +from ...utils import math_util class ContextNet(Transducer): @@ -95,11 +96,7 @@ def encoder_inference(self, features: tf.Tensor, input_length: tf.Tensor): # -------------------------------- GREEDY ------------------------------------- @tf.function - def recognize(self, - features: tf.Tensor, - input_length: tf.Tensor, - parallel_iterations: int = 10, - swap_memory: bool = True): + def recognize(self, inputs: Dict[str, tf.Tensor]): """ RNN Transducer Greedy decoding Args: @@ -108,12 +105,9 @@ def recognize(self, Returns: tf.Tensor: a batch of decoded transcripts """ - encoded = self.encoder([features, input_length], training=False) - return self._perform_greedy_batch( - encoded, input_length, - parallel_iterations=parallel_iterations, - swap_memory=swap_memory - ) + encoded = self.encoder([inputs["inputs"], inputs["inputs_length"]], training=False) + encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor) + return self._perform_greedy_batch(encoded=encoded, encoded_length=encoded_length) def recognize_tflite(self, signal, predicted, prediction_states): """ @@ -161,12 +155,7 @@ def recognize_tflite_with_timestamp(self, signal, predicted, states): # -------------------------------- BEAM SEARCH ------------------------------------- @tf.function - def recognize_beam(self, - features: tf.Tensor, - input_length: tf.Tensor, - lm: bool = False, - parallel_iterations: int = 10, - swap_memory: bool = True): + def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False): """ RNN Transducer Beam Search Args: @@ -176,9 +165,6 @@ def recognize_beam(self, Returns: tf.Tensor: a batch of decoded transcripts """ - encoded = self.encoder([features, input_length], 
training=False) - return self._perform_beam_search_batch( - encoded, input_length, lm, - parallel_iterations=parallel_iterations, - swap_memory=swap_memory - ) + encoded = self.encoder([inputs["inputs"], inputs["inputs_length"]], training=False) + encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor) + return self._perform_beam_search_batch(encoded=encoded, encoded_length=encoded_length, lm=lm) diff --git a/tensorflow_asr/models/transducer/rnn_transducer.py b/tensorflow_asr/models/transducer/rnn_transducer.py index 88ef18d80c..b02b9d7113 100644 --- a/tensorflow_asr/models/transducer/rnn_transducer.py +++ b/tensorflow_asr/models/transducer/rnn_transducer.py @@ -13,6 +13,7 @@ # limitations under the License. """ http://arxiv.org/abs/1811.06621 """ +from typing import Dict import tensorflow as tf from ..layers.subsampling import TimeReduction @@ -256,11 +257,7 @@ def encoder_inference(self, features: tf.Tensor, states: tf.Tensor): # -------------------------------- GREEDY ------------------------------------- @tf.function - def recognize(self, - features: tf.Tensor, - input_length: tf.Tensor, - parallel_iterations: int = 10, - swap_memory: bool = True): + def recognize(self, inputs: Dict[str, tf.Tensor]): """ RNN Transducer Greedy decoding Args: @@ -269,10 +266,10 @@ def recognize(self, Returns: tf.Tensor: a batch of decoded transcripts """ - batch_size, _, _, _ = shape_util.shape_list(features) - encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size)) - return self._perform_greedy_batch(encoded, input_length, - parallel_iterations=parallel_iterations, swap_memory=swap_memory) + batch_size, _, _, _ = shape_util.shape_list(inputs["inputs"]) + encoded, _ = self.encoder.recognize(inputs["inputs"], self.encoder.get_initial_state(batch_size)) + encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor) + return self._perform_greedy_batch(encoded=encoded, 
encoded_length=encoded_length) def recognize_tflite(self, signal, predicted, encoder_states, prediction_states): """ @@ -321,12 +318,7 @@ def recognize_tflite_with_timestamp(self, signal, predicted, encoder_states, pre # -------------------------------- BEAM SEARCH ------------------------------------- @tf.function - def recognize_beam(self, - features: tf.Tensor, - input_length: tf.Tensor, - lm: bool = False, - parallel_iterations: int = 10, - swap_memory: bool = True): + def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False): """ RNN Transducer Beam Search Args: @@ -336,10 +328,10 @@ def recognize_beam(self, Returns: tf.Tensor: a batch of decoded transcripts """ - batch_size, _, _, _ = shape_util.shape_list(features) - encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size)) - return self._perform_beam_search_batch(encoded, input_length, lm, - parallel_iterations=parallel_iterations, swap_memory=swap_memory) + batch_size, _, _, _ = shape_util.shape_list(inputs["inputs"]) + encoded, _ = self.encoder.recognize(inputs["inputs"], self.encoder.get_initial_state(batch_size)) + encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor) + return self._perform_beam_search_batch(encoded=encoded, encoded_length=encoded_length, lm=lm) # -------------------------------- TFLITE ------------------------------------- diff --git a/tensorflow_asr/models/transducer/transducer.py b/tensorflow_asr/models/transducer/transducer.py index 8917bf5a3b..a68ce66b84 100644 --- a/tensorflow_asr/models/transducer/transducer.py +++ b/tensorflow_asr/models/transducer/transducer.py @@ -14,6 +14,7 @@ """ https://arxiv.org/pdf/1811.06621.pdf """ import collections +from typing import Dict import tensorflow as tf from ..base_model import BaseModel @@ -347,13 +348,12 @@ def compile(self, super().compile(loss=loss, optimizer=optimizer, run_eagerly=run_eagerly, **kwargs) def call(self, inputs, training=False, 
**kwargs): - inputs, inputs_length, predictions, predictions_length = inputs.values() - enc = self.encoder(inputs, training=training, **kwargs) - pred = self.predict_net([predictions, predictions_length], training=training, **kwargs) + enc = self.encoder(inputs["inputs"], training=training, **kwargs) + pred = self.predict_net([inputs["predictions"], inputs["predictions_length"]], training=training, **kwargs) logits = self.joint_net([enc, pred], training=training, **kwargs) return data_util.create_logits( logits=logits, - logits_length=math_util.get_reduced_length(inputs_length, self.time_reduction_factor) + logits_length=math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor) ) # -------------------------------- INFERENCES ------------------------------------- @@ -400,11 +400,7 @@ def get_config(self): # -------------------------------- GREEDY ------------------------------------- @tf.function - def recognize(self, - features: tf.Tensor, - input_length: tf.Tensor, - parallel_iterations: int = 10, - swap_memory: bool = True): + def recognize(self, inputs: Dict[str, tf.Tensor]): """ RNN Transducer Greedy decoding Args: @@ -414,9 +410,9 @@ def recognize(self, Returns: tf.Tensor: a batch of decoded transcripts """ - encoded = self.encoder(features, training=False) - return self._perform_greedy_batch(encoded, input_length, - parallel_iterations=parallel_iterations, swap_memory=swap_memory) + encoded = self.encoder(inputs["inputs"], training=False) + encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor) + return self._perform_greedy_batch(encoded=encoded, encoded_length=encoded_length) def recognize_tflite(self, signal, predicted, states): """ @@ -600,12 +596,7 @@ def body(_time, _hypothesis): # -------------------------------- BEAM SEARCH ------------------------------------- @tf.function - def recognize_beam(self, - features: tf.Tensor, - input_length: tf.Tensor, - lm: bool = False, - 
parallel_iterations: int = 10, - swap_memory: bool = True): + def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False): """ RNN Transducer Beam Search Args: @@ -615,9 +606,9 @@ def recognize_beam(self, Returns: tf.Tensor: a batch of decoded transcripts """ - encoded = self.encoder(features, training=False) - return self._perform_beam_search_batch(encoded, input_length, lm, - parallel_iterations=parallel_iterations, swap_memory=swap_memory) + encoded = self.encoder(inputs["inputs"], training=False) + encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor) + return self._perform_beam_search_batch(encoded=encoded, encoded_length=encoded_length, lm=lm) def _perform_beam_search_batch(self, encoded: tf.Tensor, diff --git a/tensorflow_asr/utils/env_util.py b/tensorflow_asr/utils/env_util.py index 2bf4970415..c5564b543e 100644 --- a/tensorflow_asr/utils/env_util.py +++ b/tensorflow_asr/utils/env_util.py @@ -12,15 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import warnings import tensorflow as tf def setup_environment(): # Set memory growth and only log ERRORs """ Setting tensorflow running environment """ - import warnings warnings.simplefilter("ignore") tf.get_logger().setLevel("ERROR") - tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True}) def setup_devices(devices, cpu=False): From 4dbbb175fd723e778a5008ca7d11d76426679e0b Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Sat, 17 Apr 2021 13:25:43 +0700 Subject: [PATCH 09/13] :writing_hand: update testing functions and scripts --- examples/conformer/test.py | 27 ++++++++++------ tensorflow_asr/metrics/error_rates.py | 2 +- tensorflow_asr/models/base_model.py | 16 +++++++--- tensorflow_asr/utils/app_util.py | 45 +++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 14 deletions(-) create mode 100644 tensorflow_asr/utils/app_util.py diff --git a/examples/conformer/test.py b/examples/conformer/test.py index 4c1f8e13d1..12874187ba 100644 --- a/examples/conformer/test.py +++ b/examples/conformer/test.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +from tqdm import tqdm import argparse from tensorflow_asr.utils import env_util, file_util @@ -58,6 +59,7 @@ from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer from tensorflow_asr.models.transducer.conformer import Conformer +from tensorflow_asr.utils import app_util config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) @@ -97,13 +99,20 @@ batch_size = args.bs or config.learning_config.running_config.batch_size test_data_loader = test_dataset.create(batch_size) -results = conformer.predict(test_data_loader) - with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath: - print(f"Saving result to {args.output} ...") - with open(filepath, "w") as openfile: - openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n") - for i, entry in test_dataset.entries: - groundtruth, greedy, beamsearch = results[i] - path, duration, _ = entry - openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n") + overwrite = False + if tf.io.gfile.exists(filepath): + overwrite = input("Overwrite existing result file? 
(y/n): ").lower() == "y" + if overwrite: + results = conformer.predict(test_data_loader, verbose=1) + print(f"Saving result to {args.output} ...") + with open(filepath, "w") as openfile: + openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n") + progbar = tqdm(total=test_dataset.total_steps, unit="batch") + for i, pred in enumerate(results): + groundtruth, greedy, beamsearch = [x.decode('utf-8') for x in pred] + path, duration, _ = test_dataset.entries[i] + openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n") + progbar.update(1) + progbar.close() + app_util.evaluate_results(filepath) diff --git a/tensorflow_asr/metrics/error_rates.py b/tensorflow_asr/metrics/error_rates.py index 143e199109..2d6880e35e 100644 --- a/tensorflow_asr/metrics/error_rates.py +++ b/tensorflow_asr/metrics/error_rates.py @@ -30,4 +30,4 @@ def update_state(self, decode: tf.Tensor, target: tf.Tensor): self.denominator.assign_add(d) def result(self): - return tf.math.divide_no_nan(self.numerator, self.denominator) * 100 + return tf.math.divide_no_nan(self.numerator, self.denominator) diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py index f2eab7e0ec..1ebf8787e5 100644 --- a/tensorflow_asr/models/base_model.py +++ b/tensorflow_asr/models/base_model.py @@ -19,6 +19,10 @@ class BaseModel(tf.keras.Model): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._metrics = {} + def save(self, filepath, overwrite=True, @@ -66,7 +70,10 @@ def load_weights(self, @property def metrics(self): - return [self.loss_metric] + return self._metrics.values() + + def add_metric(self, metric: tf.keras.metrics.Metric): + self._metrics.append({metric.name: metric}) def _build(self, *args, **kwargs): raise NotImplementedError() @@ -76,7 +83,8 @@ def compile(self, loss, optimizer, run_eagerly=None, **kwargs): if not env_util.has_tpu(): optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), 
"dynamic") self.use_loss_scale = True - self.loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32) + loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32) + self._metrics = {loss_metric.name: loss_metric} super().compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs) # -------------------------------- STEP FUNCTIONS ------------------------------------- @@ -92,14 +100,14 @@ def train_step(self, batch): if self.use_loss_scale: gradients = self.optimizer.get_unscaled_gradients(gradients) self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) - self.loss_metric.update_state(loss) + self._metrics["loss"].update_state(loss) return {m.name: m.result() for m in self.metrics} def test_step(self, batch): inputs, y_true = batch y_pred = self(inputs, training=False) loss = self.loss(y_true, y_pred) - self.loss_metric.update_state(loss) + self._metrics["loss"].update_state(loss) return {m.name: m.result() for m in self.metrics} def predict_step(self, batch): diff --git a/tensorflow_asr/utils/app_util.py b/tensorflow_asr/utils/app_util.py new file mode 100644 index 0000000000..b996a8030f --- /dev/null +++ b/tensorflow_asr/utils/app_util.py @@ -0,0 +1,45 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from tqdm import tqdm +import tensorflow as tf + +from .metric_util import wer, cer +from ..metrics.error_rates import ErrorRate +from .file_util import read_file + + +def evaluate_results(filepath: str): + print(f"Evaluating result from {filepath} ...") + metrics = { + "greedy_wer": ErrorRate(wer, name="greedy_wer", dtype=tf.float32), + "greedy_cer": ErrorRate(cer, name="greedy_cer", dtype=tf.float32), + "beamsearch_wer": ErrorRate(wer, name="beamsearch_wer", dtype=tf.float32), + "beamsearch_cer": ErrorRate(cer, name="beamsearch_cer", dtype=tf.float32) + } + with read_file(filepath) as path: + with open(path, "r", encoding="utf-8") as openfile: + lines = openfile.read().splitlines() + lines = lines[1:] # skip header + for eachline in tqdm(lines): + _, _, groundtruth, greedy, beamsearch = eachline.split("\t") + groundtruth = tf.convert_to_tensor([groundtruth], dtype=tf.string) + greedy = tf.convert_to_tensor([greedy], dtype=tf.string) + beamsearch = tf.convert_to_tensor([beamsearch], dtype=tf.string) + metrics["greedy_wer"].update_state(decode=greedy, target=groundtruth) + metrics["greedy_cer"].update_state(decode=greedy, target=groundtruth) + metrics["beamsearch_wer"].update_state(decode=beamsearch, target=groundtruth) + metrics["beamsearch_cer"].update_state(decode=beamsearch, target=groundtruth) + for key, value in metrics.items(): + print(f"{key}: {value.result().numpy()}") From 51d8c5524bf44d9e7841ae48378fd4de801576a9 Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Sat, 17 Apr 2021 13:41:34 +0700 Subject: [PATCH 10/13] :writing_hand: update example scripts --- examples/conformer/tflite.py | 4 +- examples/contextnet/test.py | 74 ++++++--- examples/contextnet/tflite.py | 38 ++--- examples/contextnet/train.py | 98 ++++++------ .../train_tpu_keras_subword_contextnet.py | 144 ------------------ examples/deepspeech2/test.py | 94 ++++++++---- examples/deepspeech2/tflite.py | 69 +++++++++ examples/deepspeech2/train.py | 106 ++++++++----- 
examples/demonstration/conformer.py | 11 +- examples/jasper/test.py | 92 +++++++---- examples/jasper/tflite.py | 69 +++++++++ examples/jasper/train.py | 93 ++++++----- examples/rnn_transducer/test.py | 87 +++++++---- examples/rnn_transducer/tflite.py | 53 +++---- examples/rnn_transducer/train.py | 113 ++++++++------ 15 files changed, 655 insertions(+), 490 deletions(-) delete mode 100644 examples/contextnet/train_tpu_keras_subword_contextnet.py create mode 100644 examples/deepspeech2/tflite.py create mode 100644 examples/jasper/tflite.py diff --git a/examples/conformer/tflite.py b/examples/conformer/tflite.py index 3159f656ba..b0d40b0679 100644 --- a/examples/conformer/tflite.py +++ b/examples/conformer/tflite.py @@ -22,13 +22,13 @@ from tensorflow_asr.configs.config import Config from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer -from tensorflow_asr.models.conformer import Conformer +from tensorflow_asr.models.transducer.conformer import Conformer DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") tf.keras.backend.clear_session() -parser = argparse.ArgumentParser(prog="Conformer Testing") +parser = argparse.ArgumentParser(prog="Conformer TFLite") parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") diff --git a/examples/contextnet/test.py b/examples/contextnet/test.py index 0aaabce52b..afa6c6211b 100644 --- a/examples/contextnet/test.py +++ b/examples/contextnet/test.py @@ -13,17 +13,18 @@ # limitations under the License. 
import os +from tqdm import tqdm import argparse -from tensorflow_asr.utils import setup_environment, setup_devices +from tensorflow_asr.utils import env_util, file_util -setup_environment() +env_util.setup_environment() import tensorflow as tf DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") tf.keras.backend.clear_session() -parser = argparse.ArgumentParser(prog="ContextNet Testing") +parser = argparse.ArgumentParser(prog="Contextnet Testing") parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") @@ -33,47 +34,58 @@ parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") +parser.add_argument("--bs", type=int, default=None, help="Test batch size") + +parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") + +parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") + parser.add_argument("--device", type=int, default=0, help="Device's id to run test on") parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu") -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix") +parser.add_argument("--output", type=str, default="test.tsv", help="Result filepath") args = parser.parse_args() +assert args.saved + tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) -setup_devices([args.device], cpu=args.cpu) +env_util.setup_devices([args.device], cpu=args.cpu) from tensorflow_asr.configs.config import Config from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import 
SubwordFeaturizer -from tensorflow_asr.runners.base_runners import BaseTester -from tensorflow_asr.models.contextnet import ContextNet +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer +from tensorflow_asr.models.transducer.contextnet import ContextNet +from tensorflow_asr.utils import app_util config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) -if args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) +if args.sentence_piece: + print("Use SentencePiece ...") + text_featurizer = SentencePieceFeaturizer(config.decoder_config) +elif args.subwords: + print("Use subwords ...") + text_featurizer = SubwordFeaturizer(config.decoder_config) else: - raise ValueError("subwords must be set") + print("Use characters ...") + text_featurizer = CharFeaturizer(config.decoder_config) tf.random.set_seed(0) -assert args.saved if args.tfrecords: test_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.test_dataset_config) ) else: test_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.test_dataset_config) ) @@ -81,12 +93,26 @@ contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes) contextnet._build(speech_featurizer.shape) contextnet.load_weights(args.saved) -contextnet.summary(line_length=120) +contextnet.summary(line_length=100) contextnet.add_featurizers(speech_featurizer, text_featurizer) -contextnet_tester = BaseTester( - config=config.learning_config.running_config, - output_name=args.output_name -) 
-contextnet_tester.compile(contextnet) -contextnet_tester.run(test_dataset) +batch_size = args.bs or config.learning_config.running_config.batch_size +test_data_loader = test_dataset.create(batch_size) + +with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath: + overwrite = False + if tf.io.gfile.exists(filepath): + overwrite = input("Overwrite existing result file? (y/n): ").lower() == "y" + if overwrite: + results = contextnet.predict(test_data_loader, verbose=1) + print(f"Saving result to {args.output} ...") + with open(filepath, "w") as openfile: + openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n") + progbar = tqdm(total=test_dataset.total_steps, unit="batch") + for i, pred in enumerate(results): + groundtruth, greedy, beamsearch = [x.decode('utf-8') for x in pred] + path, duration, _ = test_dataset.entries[i] + openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n") + progbar.update(1) + progbar.close() + app_util.evaluate_results(filepath) diff --git a/examples/contextnet/tflite.py b/examples/contextnet/tflite.py index a76e4a6b78..0e8852cb19 100644 --- a/examples/contextnet/tflite.py +++ b/examples/contextnet/tflite.py @@ -14,33 +14,29 @@ import os import argparse -from tensorflow_asr.utils import setup_environment +from tensorflow_asr.utils import env_util, file_util -setup_environment() +env_util.setup_environment() import tensorflow as tf from tensorflow_asr.configs.config import Config from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer -from tensorflow_asr.models.contextnet import ContextNet +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer +from tensorflow_asr.models.transducer.contextnet import ContextNet DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") tf.keras.backend.clear_session() -parser = 
argparse.ArgumentParser(prog="ContextNet Testing") +parser = argparse.ArgumentParser(prog="ContextNet TFLite") -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--saved", type=str, default=None, - help="Path to saved model") +parser.add_argument("--saved", type=str, default=None, help="Path to saved model") -parser.add_argument("--subwords", type=str, default=None, - help="Path to file that stores generated subwords") +parser.add_argument("--subwords", type=str, default=None, help="Use subwords") -parser.add_argument("output", type=str, default=None, - help="TFLite file path to be exported") +parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported") args = parser.parse_args() @@ -49,27 +45,25 @@ config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) -if args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) +if args.subwords: + text_featurizer = SubwordFeaturizer(config.decoder_config) else: - raise ValueError("subwords must be set") + text_featurizer = CharFeaturizer(config.decoder_config) # build model contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes) contextnet._build(speech_featurizer.shape) contextnet.load_weights(args.saved) -contextnet.summary(line_length=150) +contextnet.summary(line_length=100) contextnet.add_featurizers(speech_featurizer, text_featurizer) concrete_func = contextnet.make_tflite_function().get_concrete_function() converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) +converter.experimental_new_converter = True converter.optimizations = [tf.lite.Optimize.DEFAULT] 
-converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, - tf.lite.OpsSet.SELECT_TF_OPS] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] tflite_model = converter.convert() -if not os.path.exists(os.path.dirname(args.output)): - os.makedirs(os.path.dirname(args.output)) +args.output = file_util.preprocess_paths(args.output) with open(args.output, "wb") as tflite_out: tflite_out.write(tflite_model) diff --git a/examples/contextnet/train.py b/examples/contextnet/train.py index 4046cfb858..7644fdeabe 100644 --- a/examples/contextnet/train.py +++ b/examples/contextnet/train.py @@ -15,96 +15,99 @@ import os import math import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy +from tensorflow_asr.utils import env_util -setup_environment() +env_util.setup_environment() import tensorflow as tf DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") tf.keras.backend.clear_session() -parser = argparse.ArgumentParser(prog="ContextNet Training") +parser = argparse.ArgumentParser(prog="Contextnet Training") parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") +parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") + +parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") + parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") parser.add_argument("--spx", type=int, default=1, help="Steps per execution for maximizing performance") 
-parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata") +parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata") + +parser.add_argument("--static_length", default=False, action="store_true", help="Use static lengths") parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") - args = parser.parse_args() tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) -strategy = setup_strategy(args.devices) +strategy = env_util.setup_strategy(args.devices) from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer -from tensorflow_asr.models.keras.contextnet import ContextNet +from tensorflow_asr.datasets import asr_dataset +from tensorflow_asr.featurizers import speech_featurizers, text_featurizers +from tensorflow_asr.models.transducer.contextnet import ContextNet from tensorflow_asr.optimizers.schedules import TransformerSchedule config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) +speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config) -if args.subwords and os.path.exists(args.subwords): +if args.sentence_piece: + print("Loading SentencePiece model ...") + text_featurizer = text_featurizers.SentencePieceFeaturizer(config.decoder_config) +elif args.subwords: print("Loading 
subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) + text_featurizer = text_featurizers.SubwordFeaturizer(config.decoder_config) else: - print("Generating subwords ...") - text_featurizer = SubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) + print("Use characters ...") + text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config) if args.tfrecords: - train_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + train_dataset = asr_dataset.ASRTFRecordDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.train_dataset_config), indefinite=True ) - eval_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + eval_dataset = asr_dataset.ASRTFRecordDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.eval_dataset_config), indefinite=True ) - # Update metadata calculated from both train and eval datasets - train_dataset.load_metadata(args.metadata_prefix) - eval_dataset.load_metadata(args.metadata_prefix) - # Use dynamic length - speech_featurizer.reset_length() - text_featurizer.reset_length() else: - train_dataset = ASRSliceDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + train_dataset = asr_dataset.ASRSliceDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.train_dataset_config), indefinite=True ) - eval_dataset = ASRSliceDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + eval_dataset = asr_dataset.ASRSliceDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.eval_dataset_config), 
indefinite=True ) -global_batch_size = config.learning_config.running_config.batch_size +train_dataset.load_metadata(args.metadata) +eval_dataset.load_metadata(args.metadata) + +if not args.static_length: + speech_featurizer.reset_length() + text_featurizer.reset_length() + +global_batch_size = args.tbs or config.learning_config.running_config.batch_size global_batch_size *= strategy.num_replicas_in_sync train_data_loader = train_dataset.create(global_batch_size) @@ -114,17 +117,15 @@ # build model contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes) contextnet._build(speech_featurizer.shape) - contextnet.summary(line_length=120) + contextnet.summary(line_length=100) optimizer = tf.keras.optimizers.Adam( TransformerSchedule( d_model=contextnet.dmodel, - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], + warmup_steps=config.learning_config.optimizer_config.pop("warmup_steps", 10000), max_lr=(0.05 / math.sqrt(contextnet.dmodel)) ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] + **config.learning_config.optimizer_config ) contextnet.compile( @@ -141,7 +142,10 @@ ] contextnet.fit( - train_data_loader, epochs=config.learning_config.running_config.num_epochs, - validation_data=eval_data_loader, callbacks=callbacks, - steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps + train_data_loader, + epochs=config.learning_config.running_config.num_epochs, + validation_data=eval_data_loader, + callbacks=callbacks, + steps_per_epoch=train_dataset.total_steps, + validation_steps=eval_dataset.total_steps ) diff --git a/examples/contextnet/train_tpu_keras_subword_contextnet.py b/examples/contextnet/train_tpu_keras_subword_contextnet.py deleted file mode 100644 index f0bc5e64a8..0000000000 --- a/examples/contextnet/train_tpu_keras_subword_contextnet.py +++ 
/dev/null @@ -1,144 +0,0 @@ -# Copyright 2020 Huy Le Nguyen (@usimarit) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import argparse -from tensorflow_asr.utils import setup_environment, setup_tpu - -setup_environment() -import tensorflow as tf - -DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") - -tf.keras.backend.clear_session() - -parser = argparse.ArgumentParser(prog="Conformer Training") - -parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") - -parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") - -parser.add_argument("--bs", type=int, default=None, help="Batch size per replica") - -parser.add_argument("--spx", type=int, default=50, help="Steps per execution for maximizing TPU performance") - -parser.add_argument("--tpu_address", type=str, default=None, help="TPU address. 
Leave None on Colab") - -parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata") - -parser.add_argument("--compute_lengths", default=False, action="store_true", help="Whether to compute lengths") - -parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") - -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords") - -parser.add_argument("--saved", type=str, default=None, help="Path to saved model") - -args = parser.parse_args() - -tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) - -strategy = setup_tpu(args.tpu_address) - -from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import TFSubwordFeaturizer, SentencePieceFeaturizer -from tensorflow_asr.models.keras.contextnet import ContextNet -from tensorflow_asr.optimizers.schedules import TransformerSchedule - -config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) - -if args.sentence_piece: - print("Loading SentencePiece model ...") - text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords) -elif args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = TFSubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) -else: - print("Generating subwords ...") - text_featurizer = TFSubwordFeaturizer.build_from_corpus( - config.decoder_config, - corpus_files=args.subwords_corpus - ) - text_featurizer.save_to_file(args.subwords) - -train_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, 
text_featurizer=text_featurizer, - **vars(config.learning_config.train_dataset_config), - indefinite=True -) -eval_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config), - indefinite=True -) - -if args.compute_lengths: - train_dataset.update_lengths(args.metadata_prefix) - eval_dataset.update_lengths(args.metadata_prefix) - -# Update metadata calculated from both train and eval datasets -train_dataset.load_metadata(args.metadata_prefix) -eval_dataset.load_metadata(args.metadata_prefix) - -batch_size = args.bs if args.bs is not None else config.learning_config.running_config.batch_size -global_batch_size = batch_size -global_batch_size *= strategy.num_replicas_in_sync - -train_data_loader = train_dataset.create(global_batch_size) -eval_data_loader = eval_dataset.create(global_batch_size) - -with strategy.scope(): - # build model - contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes) - contextnet._build(speech_featurizer.shape, prediction_shape=text_featurizer.prepand_shape, batch_size=global_batch_size) - contextnet.summary(line_length=120) - - if args.saved: - contextnet.load_weights(args.saved, by_name=True, skip_mismatch=True) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=contextnet.dmodel, - warmup_steps=config.learning_config.optimizer_config["warmup_steps"], - max_lr=(0.05 / math.sqrt(contextnet.dmodel)) - ), - beta_1=config.learning_config.optimizer_config["beta1"], - beta_2=config.learning_config.optimizer_config["beta2"], - epsilon=config.learning_config.optimizer_config["epsilon"] - ) - - contextnet.compile( - optimizer=optimizer, - experimental_steps_per_execution=args.spx, - global_batch_size=global_batch_size, - blank=text_featurizer.blank - ) - -callbacks = [ - tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint), - 
tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir), - tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard) -] - -contextnet.fit( - train_data_loader, epochs=config.learning_config.running_config.num_epochs, - validation_data=eval_data_loader, callbacks=callbacks, - steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps -) diff --git a/examples/deepspeech2/test.py b/examples/deepspeech2/test.py index 096add656f..d475be31c1 100644 --- a/examples/deepspeech2/test.py +++ b/examples/deepspeech2/test.py @@ -13,70 +13,106 @@ # limitations under the License. import os +from tqdm import tqdm import argparse -from tensorflow_asr.utils import setup_environment, setup_devices +from tensorflow_asr.utils import env_util, file_util -setup_environment() +env_util.setup_environment() import tensorflow as tf DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") tf.keras.backend.clear_session() -parser = argparse.ArgumentParser(prog="Deep Speech 2 Tester") +parser = argparse.ArgumentParser(prog="DeepSpeech2 Testing") -parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--saved", type=str, default=None, help="Path to the model file to be exported") +parser.add_argument("--saved", type=str, default=None, help="Path to saved model") -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset") +parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset") parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") +parser.add_argument("--bs", type=int, default=None, help="Test batch size") + 
+parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") + +parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") + parser.add_argument("--device", type=int, default=0, help="Device's id to run test on") -parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix") +parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu") + +parser.add_argument("--output", type=str, default="test.tsv", help="Result filepath") args = parser.parse_args() +assert args.saved + tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) -setup_devices([args.device]) +env_util.setup_devices([args.device], cpu=args.cpu) from tensorflow_asr.configs.config import Config from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.base_runners import BaseTester -from tensorflow_asr.models.deepspeech2 import DeepSpeech2 - -tf.random.set_seed(0) -assert args.export +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer +from tensorflow_asr.models.ctc.deepspeech2 import DeepSpeech2 +from tensorflow_asr.utils import app_util config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) -# Build DS2 model -ds2_model = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes) -ds2_model._build(speech_featurizer.shape) -ds2_model.load_weights(args.saved) -ds2_model.summary(line_length=120) -ds2_model.add_featurizers(speech_featurizer, text_featurizer) + +if args.sentence_piece: + print("Use SentencePiece ...") + text_featurizer = 
SentencePieceFeaturizer(config.decoder_config) +elif args.subwords: + print("Use subwords ...") + text_featurizer = SubwordFeaturizer(config.decoder_config) +else: + print("Use characters ...") + text_featurizer = CharFeaturizer(config.decoder_config) + +tf.random.set_seed(0) if args.tfrecords: test_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.test_dataset_config) ) else: test_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.test_dataset_config) ) -ctc_tester = BaseTester( - config=config.learning_config.running_config, - output_name=args.output_name -) -ctc_tester.compile(ds2_model) -ctc_tester.run(test_dataset) +# build model +deepspeech2 = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes) +deepspeech2._build(speech_featurizer.shape) +deepspeech2.load_weights(args.saved) +deepspeech2.summary(line_length=100) +deepspeech2.add_featurizers(speech_featurizer, text_featurizer) + +batch_size = args.bs or config.learning_config.running_config.batch_size +test_data_loader = test_dataset.create(batch_size) + +with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath: + overwrite = False + if tf.io.gfile.exists(filepath): + overwrite = input("Overwrite existing result file? 
(y/n): ").lower() == "y" + if overwrite: + results = deepspeech2.predict(test_data_loader, verbose=1) + print(f"Saving result to {args.output} ...") + with open(filepath, "w") as openfile: + openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n") + progbar = tqdm(total=test_dataset.total_steps, unit="batch") + for i, pred in enumerate(results): + groundtruth, greedy, beamsearch = [x.decode('utf-8') for x in pred] + path, duration, _ = test_dataset.entries[i] + openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n") + progbar.update(1) + progbar.close() + app_util.evaluate_results(filepath) diff --git a/examples/deepspeech2/tflite.py b/examples/deepspeech2/tflite.py new file mode 100644 index 0000000000..81980e1fb2 --- /dev/null +++ b/examples/deepspeech2/tflite.py @@ -0,0 +1,69 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import argparse +from tensorflow_asr.utils import env_util, file_util + +env_util.setup_environment() +import tensorflow as tf + +from tensorflow_asr.configs.config import Config +from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer +from tensorflow_asr.models.ctc.deepspeech2 import DeepSpeech2 + +DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") + +tf.keras.backend.clear_session() + +parser = argparse.ArgumentParser(prog="DeepSpeech2 TFLite") + +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") + +parser.add_argument("--saved", type=str, default=None, help="Path to saved model") + +parser.add_argument("--subwords", type=str, default=None, help="Use subwords") + +parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported") + +args = parser.parse_args() + +assert args.saved and args.output + +config = Config(args.config) +speech_featurizer = TFSpeechFeaturizer(config.speech_config) + +if args.subwords: + text_featurizer = SubwordFeaturizer(config.decoder_config) +else: + text_featurizer = CharFeaturizer(config.decoder_config) + +# build model +deepspeech2 = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes) +deepspeech2._build(speech_featurizer.shape) +deepspeech2.load_weights(args.saved) +deepspeech2.summary(line_length=100) +deepspeech2.add_featurizers(speech_featurizer, text_featurizer) + +concrete_func = deepspeech2.make_tflite_function().get_concrete_function() +converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) +converter.experimental_new_converter = True +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] +tflite_model = converter.convert() + 
+args.output = file_util.preprocess_paths(args.output) +with open(args.output, "wb") as tflite_out: + tflite_out.write(tflite_model) diff --git a/examples/deepspeech2/train.py b/examples/deepspeech2/train.py index 49e0b83d95..3f3e5972c5 100644 --- a/examples/deepspeech2/train.py +++ b/examples/deepspeech2/train.py @@ -14,28 +14,34 @@ import os import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy +from tensorflow_asr.utils import env_util -setup_environment() +env_util.setup_environment() import tensorflow as tf DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") tf.keras.backend.clear_session() -parser = argparse.ArgumentParser(prog="Deep Speech 2 Training") +parser = argparse.ArgumentParser(prog="DeepSpeech2 Training") -parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") +parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas") +parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas") +parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") -parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata") +parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset") +parser.add_argument("--ebs", type=int, 
default=None, help="Evaluation batch size per replica") + +parser.add_argument("--spx", type=int, default=1, help="Steps per execution for maximizing performance") + +parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata") + +parser.add_argument("--static_length", default=False, action="store_true", help="Use static lengths") parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") @@ -45,59 +51,72 @@ tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) -strategy = setup_strategy(args.devices) +strategy = env_util.setup_strategy(args.devices) from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.models.keras.deepspeech2 import DeepSpeech2 +from tensorflow_asr.datasets import asr_dataset +from tensorflow_asr.featurizers import speech_featurizers, text_featurizers +from tensorflow_asr.models.ctc.deepspeech2 import DeepSpeech2 config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) +speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config) + +if args.sentence_piece: + print("Loading SentencePiece model ...") + text_featurizer = text_featurizers.SentencePieceFeaturizer(config.decoder_config) +elif args.subwords: + print("Loading subwords ...") + text_featurizer = text_featurizers.SubwordFeaturizer(config.decoder_config) +else: + print("Use characters ...") + text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config) if args.tfrecords: - train_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + train_dataset 
= asr_dataset.ASRTFRecordDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.train_dataset_config), indefinite=True ) - eval_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) + eval_dataset = asr_dataset.ASRTFRecordDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, + **vars(config.learning_config.eval_dataset_config), + indefinite=True ) - # Update metadata calculated from both train and eval datasets - train_dataset.load_metadata(args.metadata_prefix) - eval_dataset.load_metadata(args.metadata_prefix) - # Use dynamic length - speech_featurizer.reset_length() - text_featurizer.reset_length() else: - train_dataset = ASRSliceDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + train_dataset = asr_dataset.ASRSliceDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.train_dataset_config), indefinite=True ) - eval_dataset = ASRSliceDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + eval_dataset = asr_dataset.ASRSliceDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.eval_dataset_config), indefinite=True ) -global_batch_size = config.learning_config.running_config.batch_size +train_dataset.load_metadata(args.metadata) +eval_dataset.load_metadata(args.metadata) + +if not args.static_length: + speech_featurizer.reset_length() + text_featurizer.reset_length() + +global_batch_size = args.tbs or config.learning_config.running_config.batch_size global_batch_size *= strategy.num_replicas_in_sync train_data_loader = train_dataset.create(global_batch_size) eval_data_loader = eval_dataset.create(global_batch_size) -# Build DS2 model with strategy.scope(): - ds2_model = 
DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes) - ds2_model._build(speech_featurizer.shape) - ds2_model.summary(line_length=120) - - ds2_model.compile( + # build model + deepspeech2 = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes) + deepspeech2._build(speech_featurizer.shape) + deepspeech2.summary(line_length=100) + deepspeech2.compile( optimizer=config.learning_config.optimizer_config, experimental_steps_per_execution=args.spx, global_batch_size=global_batch_size, @@ -110,8 +129,11 @@ tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard) ] -ds2_model.fit( - train_data_loader, epochs=config.learning_config.running_config.num_epochs, - validation_data=eval_data_loader, callbacks=callbacks, - steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps +deepspeech2.fit( + train_data_loader, + epochs=config.learning_config.running_config.num_epochs, + validation_data=eval_data_loader, + callbacks=callbacks, + steps_per_epoch=train_dataset.total_steps, + validation_steps=eval_dataset.total_steps ) diff --git a/examples/demonstration/conformer.py b/examples/demonstration/conformer.py index 7e0a280f53..1870a9777c 100644 --- a/examples/demonstration/conformer.py +++ b/examples/demonstration/conformer.py @@ -14,10 +14,9 @@ import os import argparse -from tensorflow_asr.utils import setup_environment, setup_devices -from tensorflow_asr.utils.utils import get_reduced_length +from tensorflow_asr.utils import env_util, math_util -setup_environment() +env_util.setup_environment() import tensorflow as tf parser = argparse.ArgumentParser(prog="Conformer non streaming") @@ -42,13 +41,13 @@ args = parser.parse_args() -setup_devices([args.device], cpu=args.cpu) +env_util.setup_devices([args.device], cpu=args.cpu) from tensorflow_asr.configs.config import Config from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio from 
tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer, SubwordFeaturizer, SentencePieceFeaturizer -from tensorflow_asr.models.conformer import Conformer +from tensorflow_asr.models.transducer.conformer import Conformer config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) @@ -71,7 +70,7 @@ signal = read_raw_audio(args.filename) features = speech_featurizer.tf_extract(signal) -input_length = get_reduced_length(tf.shape(features)[0], conformer.time_reduction_factor) +input_length = math_util.get_reduced_length(tf.shape(features)[0], conformer.time_reduction_factor) if args.beam_width: transcript = conformer.recognize_beam(features[None, ...], input_length[None, ...]) diff --git a/examples/jasper/test.py b/examples/jasper/test.py index 48cabaf808..06e7c98ede 100644 --- a/examples/jasper/test.py +++ b/examples/jasper/test.py @@ -13,10 +13,11 @@ # limitations under the License. 
import os +from tqdm import tqdm import argparse -from tensorflow_asr.utils import setup_environment, setup_devices +from tensorflow_asr.utils import env_util, file_util -setup_environment() +env_util.setup_environment() import tensorflow as tf DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") @@ -25,58 +26,93 @@ parser = argparse.ArgumentParser(prog="Jasper Testing") -parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--saved", type=str, default=None, help="Path to the model file to be exported") +parser.add_argument("--saved", type=str, default=None, help="Path to saved model") -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset") +parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset") parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") +parser.add_argument("--bs", type=int, default=None, help="Test batch size") + +parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") + +parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") + parser.add_argument("--device", type=int, default=0, help="Device's id to run test on") -parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix") +parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu") + +parser.add_argument("--output", type=str, default="test.tsv", help="Result filepath") args = parser.parse_args() +assert args.saved + tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) -setup_devices([args.device]) 
+env_util.setup_devices([args.device], cpu=args.cpu) from tensorflow_asr.configs.config import Config from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.runners.base_runners import BaseTester -from tensorflow_asr.models.jasper import Jasper - -tf.random.set_seed(0) -assert args.export +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer +from tensorflow_asr.models.ctc.jasper import Jasper +from tensorflow_asr.utils import app_util config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) -# Build DS2 model -jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes) -jasper._build(speech_featurizer.shape) -jasper.load_weights(args.saved) -jasper.summary(line_length=120) -jasper.add_featurizers(speech_featurizer, text_featurizer) + +if args.sentence_piece: + print("Use SentencePiece ...") + text_featurizer = SentencePieceFeaturizer(config.decoder_config) +elif args.subwords: + print("Use subwords ...") + text_featurizer = SubwordFeaturizer(config.decoder_config) +else: + print("Use characters ...") + text_featurizer = CharFeaturizer(config.decoder_config) + +tf.random.set_seed(0) if args.tfrecords: test_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.test_dataset_config) ) else: test_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.test_dataset_config) ) -ctc_tester = BaseTester( - 
config=config.learning_config.running_config, - output_name=args.output_name -) -ctc_tester.compile(jasper) -ctc_tester.run(test_dataset) +# build model +jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes) +jasper._build(speech_featurizer.shape) +jasper.load_weights(args.saved) +jasper.summary(line_length=100) +jasper.add_featurizers(speech_featurizer, text_featurizer) + +batch_size = args.bs or config.learning_config.running_config.batch_size +test_data_loader = test_dataset.create(batch_size) + +with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath: + overwrite = False + if tf.io.gfile.exists(filepath): + overwrite = input("Overwrite existing result file? (y/n): ").lower() == "y" + if overwrite: + results = jasper.predict(test_data_loader, verbose=1) + print(f"Saving result to {args.output} ...") + with open(filepath, "w") as openfile: + openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n") + progbar = tqdm(total=test_dataset.total_steps, unit="batch") + for i, pred in enumerate(results): + groundtruth, greedy, beamsearch = [x.decode('utf-8') for x in pred] + path, duration, _ = test_dataset.entries[i] + openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n") + progbar.update(1) + progbar.close() + app_util.evaluate_results(filepath) diff --git a/examples/jasper/tflite.py b/examples/jasper/tflite.py new file mode 100644 index 0000000000..962118e165 --- /dev/null +++ b/examples/jasper/tflite.py @@ -0,0 +1,69 @@ +# Copyright 2020 Huy Le Nguyen (@usimarit) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +from tensorflow_asr.utils import env_util, file_util + +env_util.setup_environment() +import tensorflow as tf + +from tensorflow_asr.configs.config import Config +from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer +from tensorflow_asr.models.ctc.jasper import Jasper + +DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") + +tf.keras.backend.clear_session() + +parser = argparse.ArgumentParser(prog="Jasper TFLite") + +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") + +parser.add_argument("--saved", type=str, default=None, help="Path to saved model") + +parser.add_argument("--subwords", type=str, default=None, help="Use subwords") + +parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported") + +args = parser.parse_args() + +assert args.saved and args.output + +config = Config(args.config) +speech_featurizer = TFSpeechFeaturizer(config.speech_config) + +if args.subwords: + text_featurizer = SubwordFeaturizer(config.decoder_config) +else: + text_featurizer = CharFeaturizer(config.decoder_config) + +# build model +jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes) +jasper._build(speech_featurizer.shape) +jasper.load_weights(args.saved) +jasper.summary(line_length=100) +jasper.add_featurizers(speech_featurizer, text_featurizer) + +concrete_func = 
jasper.make_tflite_function().get_concrete_function() +converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) +converter.experimental_new_converter = True +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] +tflite_model = converter.convert() + +args.output = file_util.preprocess_paths(args.output) +with open(args.output, "wb") as tflite_out: + tflite_out.write(tflite_model) diff --git a/examples/jasper/train.py b/examples/jasper/train.py index 444ca1314a..f27d63c066 100644 --- a/examples/jasper/train.py +++ b/examples/jasper/train.py @@ -14,9 +14,9 @@ import os import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy +from tensorflow_asr.utils import env_util -setup_environment() +env_util.setup_environment() import tensorflow as tf DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") @@ -25,19 +25,23 @@ parser = argparse.ArgumentParser(prog="Jasper Training") -parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") +parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") -parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas") +parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") -parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas") +parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") + +parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") + 
+parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") parser.add_argument("--spx", type=int, default=1, help="Steps per execution for maximizing performance") -parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata") +parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata") -parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset") +parser.add_argument("--static_length", default=False, action="store_true", help="Use static lengths") parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") @@ -47,57 +51,71 @@ tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) -strategy = setup_strategy(args.devices) +strategy = env_util.setup_strategy(args.devices) from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer -from tensorflow_asr.models.keras.jasper import Jasper +from tensorflow_asr.datasets import asr_dataset +from tensorflow_asr.featurizers import speech_featurizers, text_featurizers +from tensorflow_asr.models.ctc.jasper import Jasper config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) -text_featurizer = CharFeaturizer(config.decoder_config) +speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config) + +if args.sentence_piece: + print("Loading SentencePiece model ...") + text_featurizer = text_featurizers.SentencePieceFeaturizer(config.decoder_config) +elif args.subwords: + print("Loading subwords ...") + text_featurizer = text_featurizers.SubwordFeaturizer(config.decoder_config) +else: + print("Use 
characters ...") + text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config) if args.tfrecords: - train_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + train_dataset = asr_dataset.ASRTFRecordDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.train_dataset_config), indefinite=True ) - eval_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) + eval_dataset = asr_dataset.ASRTFRecordDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, + **vars(config.learning_config.eval_dataset_config), + indefinite=True ) - # Update metadata calculated from both train and eval datasets - train_dataset.load_metadata(args.metadata_prefix) - eval_dataset.load_metadata(args.metadata_prefix) - # Use dynamic length - speech_featurizer.reset_length() - text_featurizer.reset_length() else: - train_dataset = ASRSliceDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + train_dataset = asr_dataset.ASRSliceDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.train_dataset_config), indefinite=True ) - eval_dataset = ASRSliceDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + eval_dataset = asr_dataset.ASRSliceDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.eval_dataset_config), indefinite=True ) -global_batch_size = config.learning_config.running_config.batch_size +train_dataset.load_metadata(args.metadata) +eval_dataset.load_metadata(args.metadata) + +if not args.static_length: + speech_featurizer.reset_length() + text_featurizer.reset_length() + +global_batch_size = args.tbs or config.learning_config.running_config.batch_size 
global_batch_size *= strategy.num_replicas_in_sync train_data_loader = train_dataset.create(global_batch_size) eval_data_loader = eval_dataset.create(global_batch_size) with strategy.scope(): + # build model jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes) jasper._build(speech_featurizer.shape) - jasper.summary(line_length=120) - + jasper.summary(line_length=100) jasper.compile( optimizer=config.learning_config.optimizer_config, experimental_steps_per_execution=args.spx, @@ -112,7 +130,10 @@ ] jasper.fit( - train_data_loader, epochs=config.learning_config.running_config.num_epochs, - validation_data=eval_data_loader, callbacks=callbacks, - steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps + train_data_loader, + epochs=config.learning_config.running_config.num_epochs, + validation_data=eval_data_loader, + callbacks=callbacks, + steps_per_epoch=train_dataset.total_steps, + validation_steps=eval_dataset.total_steps ) diff --git a/examples/rnn_transducer/test.py b/examples/rnn_transducer/test.py index 377ef291a0..724924ce40 100644 --- a/examples/rnn_transducer/test.py +++ b/examples/rnn_transducer/test.py @@ -13,17 +13,18 @@ # limitations under the License. 
import os +from tqdm import tqdm import argparse -from tensorflow_asr.utils import setup_environment, setup_devices +from tensorflow_asr.utils import env_util, file_util -setup_environment() +env_util.setup_environment() import tensorflow as tf DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") tf.keras.backend.clear_session() -parser = argparse.ArgumentParser(prog="Conformer Testing") +parser = argparse.ArgumentParser(prog="RnnTransducer Testing") parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") @@ -33,63 +34,85 @@ parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") +parser.add_argument("--bs", type=int, default=None, help="Test batch size") + +parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") + +parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") + parser.add_argument("--device", type=int, default=0, help="Device's id to run test on") parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu") -parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords") - -parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix") +parser.add_argument("--output", type=str, default="test.tsv", help="Result filepath") args = parser.parse_args() +assert args.saved + tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) -setup_devices([args.device], cpu=args.cpu) +env_util.setup_devices([args.device], cpu=args.cpu) from tensorflow_asr.configs.config import Config from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import 
SubwordFeaturizer -from tensorflow_asr.runners.base_runners import BaseTester -from tensorflow_asr.models.streaming_transducer import StreamingTransducer +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer +from tensorflow_asr.models.transducer.rnn_transducer import RnnTransducer +from tensorflow_asr.utils import app_util config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) -if args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) +if args.sentence_piece: + print("Use SentencePiece ...") + text_featurizer = SentencePieceFeaturizer(config.decoder_config) +elif args.subwords: + print("Use subwords ...") + text_featurizer = SubwordFeaturizer(config.decoder_config) else: - raise ValueError("subwords must be set") + print("Use characters ...") + text_featurizer = CharFeaturizer(config.decoder_config) tf.random.set_seed(0) -assert args.saved if args.tfrecords: test_dataset = ASRTFRecordDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.test_dataset_config) ) else: test_dataset = ASRSliceDataset( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.test_dataset_config) ) # build model -streaming_transducer = StreamingTransducer( - vocabulary_size=text_featurizer.num_classes, - **config.model_config -) -streaming_transducer._build(speech_featurizer.shape) -streaming_transducer.load_weights(args.saved) -streaming_transducer.summary(line_length=150) -streaming_transducer.add_featurizers(speech_featurizer, text_featurizer) - -streaming_transducer_tester = BaseTester( - 
config=config.learning_config.running_config, - output_name=args.output_name -) -streaming_transducer_tester.compile(streaming_transducer) -streaming_transducer_tester.run(test_dataset) +rnn_transducer = RnnTransducer(**config.model_config, vocabulary_size=text_featurizer.num_classes) +rnn_transducer._build(speech_featurizer.shape) +rnn_transducer.load_weights(args.saved) +rnn_transducer.summary(line_length=100) +rnn_transducer.add_featurizers(speech_featurizer, text_featurizer) + +batch_size = args.bs or config.learning_config.running_config.batch_size +test_data_loader = test_dataset.create(batch_size) + +with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath: + overwrite = False + if tf.io.gfile.exists(filepath): + overwrite = input("Overwrite existing result file? (y/n): ").lower() == "y" + if overwrite: + results = rnn_transducer.predict(test_data_loader, verbose=1) + print(f"Saving result to {args.output} ...") + with open(filepath, "w") as openfile: + openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n") + progbar = tqdm(total=test_dataset.total_steps, unit="batch") + for i, pred in enumerate(results): + groundtruth, greedy, beamsearch = [x.decode('utf-8') for x in pred] + path, duration, _ = test_dataset.entries[i] + openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n") + progbar.update(1) + progbar.close() + app_util.evaluate_results(filepath) diff --git a/examples/rnn_transducer/tflite.py b/examples/rnn_transducer/tflite.py index 254e56de8b..1d2092029d 100644 --- a/examples/rnn_transducer/tflite.py +++ b/examples/rnn_transducer/tflite.py @@ -14,33 +14,29 @@ import os import argparse -from tensorflow_asr.utils import setup_environment +from tensorflow_asr.utils import env_util, file_util -setup_environment() +env_util.setup_environment() import tensorflow as tf from tensorflow_asr.configs.config import Config from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from 
tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer -from tensorflow_asr.models.streaming_transducer import StreamingTransducer +from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer +from tensorflow_asr.models.transducer.rnn_transducer import RnnTransducer DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") tf.keras.backend.clear_session() -parser = argparse.ArgumentParser(prog="Conformer Testing") +parser = argparse.ArgumentParser(prog="RnnTransducer TFLite") -parser.add_argument("--config", type=str, default=DEFAULT_YAML, - help="The file path of model configuration file") +parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--saved", type=str, default=None, - help="Path to saved model") +parser.add_argument("--saved", type=str, default=None, help="Path to saved model") -parser.add_argument("--subwords", type=str, default=None, - help="Path to file that stores generated subwords") +parser.add_argument("--subwords", type=str, default=None, help="Use subwords") -parser.add_argument("output", type=str, default=None, - help="TFLite file path to be exported") +parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported") args = parser.parse_args() @@ -49,30 +45,25 @@ config = Config(args.config) speech_featurizer = TFSpeechFeaturizer(config.speech_config) -if args.subwords and os.path.exists(args.subwords): - print("Loading subwords ...") - text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords) +if args.subwords: + text_featurizer = SubwordFeaturizer(config.decoder_config) else: - raise ValueError("subwords must be set") + text_featurizer = CharFeaturizer(config.decoder_config) # build model -streaming_transducer = StreamingTransducer( - **config.model_config, - vocabulary_size=text_featurizer.num_classes -) 
-streaming_transducer._build(speech_featurizer.shape) -streaming_transducer.load_weights(args.saved) -streaming_transducer.summary(line_length=150) -streaming_transducer.add_featurizers(speech_featurizer, text_featurizer) - -concrete_func = streaming_transducer.make_tflite_function().get_concrete_function() +rnn_transducer = RnnTransducer(**config.model_config, vocabulary_size=text_featurizer.num_classes) +rnn_transducer._build(speech_featurizer.shape) +rnn_transducer.load_weights(args.saved) +rnn_transducer.summary(line_length=100) +rnn_transducer.add_featurizers(speech_featurizer, text_featurizer) + +concrete_func = rnn_transducer.make_tflite_function().get_concrete_function() converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) +converter.experimental_new_converter = True converter.optimizations = [tf.lite.Optimize.DEFAULT] -converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, - tf.lite.OpsSet.SELECT_TF_OPS] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] tflite_model = converter.convert() -if not os.path.exists(os.path.dirname(args.output)): - os.makedirs(os.path.dirname(args.output)) +args.output = file_util.preprocess_paths(args.output) with open(args.output, "wb") as tflite_out: tflite_out.write(tflite_model) diff --git a/examples/rnn_transducer/train.py b/examples/rnn_transducer/train.py index 6f7c92c643..a35f7f2801 100644 --- a/examples/rnn_transducer/train.py +++ b/examples/rnn_transducer/train.py @@ -13,89 +13,101 @@ # limitations under the License. 
import os +import math import argparse -from tensorflow_asr.utils import setup_environment, setup_strategy +from tensorflow_asr.utils import env_util -setup_environment() +env_util.setup_environment() import tensorflow as tf DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml") tf.keras.backend.clear_session() -parser = argparse.ArgumentParser(prog="Conformer Training") +parser = argparse.ArgumentParser(prog="RnnTransducer Training") parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file") -parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep") - parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords") +parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") + +parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords") + parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica") parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica") parser.add_argument("--spx", type=int, default=1, help="Steps per execution for maximizing performance") -parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata") +parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata") + +parser.add_argument("--static_length", default=False, action="store_true", help="Use static lengths") parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training") parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision") -parser.add_argument("--subword", default=False, action="store_true", help="Use subword") - args = parser.parse_args() 
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp}) -strategy = setup_strategy(args.devices) +strategy = env_util.setup_strategy(args.devices) from tensorflow_asr.configs.config import Config -from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras -from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer -from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer -from tensorflow_asr.models.keras.streaming_transducer import StreamingTransducer +from tensorflow_asr.datasets import asr_dataset +from tensorflow_asr.featurizers import speech_featurizers, text_featurizers +from tensorflow_asr.models.transducer.rnn_transducer import RnnTransducer +from tensorflow_asr.optimizers.schedules import TransformerSchedule config = Config(args.config) -speech_featurizer = TFSpeechFeaturizer(config.speech_config) - -if args.subword: - print("Use subwords ...") - text_featurizer = SubwordFeaturizer(config.decoder_config) +speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config) + +if args.sentence_piece: + print("Loading SentencePiece model ...") + text_featurizer = text_featurizers.SentencePieceFeaturizer(config.decoder_config) +elif args.subwords: + print("Loading subwords ...") + text_featurizer = text_featurizers.SubwordFeaturizer(config.decoder_config) else: print("Use characters ...") - text_featurizer = CharFeaturizer(config.decoder_config) + text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config) if args.tfrecords: - train_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + train_dataset = asr_dataset.ASRTFRecordDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.train_dataset_config), indefinite=True ) - eval_dataset = ASRTFRecordDatasetKeras( - speech_featurizer=speech_featurizer, 
text_featurizer=text_featurizer, - **vars(config.learning_config.eval_dataset_config) + eval_dataset = asr_dataset.ASRTFRecordDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, + **vars(config.learning_config.eval_dataset_config), + indefinite=True ) - # Update metadata calculated from both train and eval datasets - train_dataset.load_metadata(args.metadata_prefix) - eval_dataset.load_metadata(args.metadata_prefix) - # Use dynamic length - speech_featurizer.reset_length() - text_featurizer.reset_length() else: - train_dataset = ASRSliceDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + train_dataset = asr_dataset.ASRSliceDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.train_dataset_config), indefinite=True ) - eval_dataset = ASRSliceDatasetKeras( - speech_featurizer=speech_featurizer, text_featurizer=text_featurizer, + eval_dataset = asr_dataset.ASRSliceDataset( + speech_featurizer=speech_featurizer, + text_featurizer=text_featurizer, **vars(config.learning_config.eval_dataset_config), indefinite=True ) -global_batch_size = config.learning_config.running_config.batch_size +train_dataset.load_metadata(args.metadata) +eval_dataset.load_metadata(args.metadata) + +if not args.static_length: + speech_featurizer.reset_length() + text_featurizer.reset_length() + +global_batch_size = args.tbs or config.learning_config.running_config.batch_size global_batch_size *= strategy.num_replicas_in_sync train_data_loader = train_dataset.create(global_batch_size) @@ -103,16 +115,20 @@ with strategy.scope(): # build model - streaming_transducer = StreamingTransducer( - **config.model_config, - vocabulary_size=text_featurizer.num_classes + rnn_transducer = RnnTransducer(**config.model_config, vocabulary_size=text_featurizer.num_classes) + rnn_transducer._build(speech_featurizer.shape) + rnn_transducer.summary(line_length=100) + + optimizer = 
tf.keras.optimizers.Adam( + TransformerSchedule( + d_model=rnn_transducer.dmodel, + warmup_steps=config.learning_config.optimizer_config.pop("warmup_steps", 10000), + max_lr=(0.05 / math.sqrt(rnn_transducer.dmodel)) + ), + **config.learning_config.optimizer_config ) - streaming_transducer._build(speech_featurizer.shape) - streaming_transducer.summary(line_length=150) - - optimizer = tf.keras.optimizers.get(config.learning_config.optimizer_config) - streaming_transducer.compile( + rnn_transducer.compile( optimizer=optimizer, experimental_steps_per_execution=args.spx, global_batch_size=global_batch_size, @@ -125,8 +141,11 @@ tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard) ] -streaming_transducer.fit( - train_data_loader, epochs=config.learning_config.running_config.num_epochs, - validation_data=eval_data_loader, callbacks=callbacks, - steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps +rnn_transducer.fit( + train_data_loader, + epochs=config.learning_config.running_config.num_epochs, + validation_data=eval_data_loader, + callbacks=callbacks, + steps_per_epoch=train_dataset.total_steps, + validation_steps=eval_dataset.total_steps ) From d0284895bfe33a21edc2fa0d256423f236e3c9eb Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Sat, 17 Apr 2021 14:01:30 +0700 Subject: [PATCH 11/13] :writing_hand: update contextnet and init notebook examples --- README.md | 1 + examples/contextnet/config.yml | 29 +++++++------------ notebooks/conformer.ipynb | 0 notebooks/contextnet.ipynb | 0 notebooks/deepspeech2.ipynb | 0 notebooks/jasper.ipynb | 0 tensorflow_asr/models/base_model.py | 8 +++-- tensorflow_asr/models/ctc/ctc.py | 4 --- .../models/transducer/contextnet.py | 14 +++++---- 9 files changed, 26 insertions(+), 30 deletions(-) create mode 100644 notebooks/conformer.ipynb create mode 100644 notebooks/contextnet.ipynb create mode 100644 notebooks/deepspeech2.ipynb create mode 100644 notebooks/jasper.ipynb diff 
--git a/README.md b/README.md index df80c93f7e..36da89c78b 100755 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ TensorFlowASR implements some automatic speech recognition architectures such as ## What's New? +- (04/17/2021) Refactor repository with new version 1.x - (02/16/2021) Supported for TPU training - (12/27/2020) Supported _naive_ token level timestamp, see [demo](./examples/demonstration/conformer.py) with flag `--timestamp` - (12/17/2020) Supported ContextNet [http://arxiv.org/abs/2005.03191](http://arxiv.org/abs/2005.03191) diff --git a/examples/contextnet/config.yml b/examples/contextnet/config.yml index 24b2f17e9d..790c0e5a38 100644 --- a/examples/contextnet/config.yml +++ b/examples/contextnet/config.yml @@ -207,8 +207,8 @@ learning_config: num_masks: 1 mask_factor: 27 data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv + tfrecords_dir: null shuffle: True cache: True buffer_size: 100 @@ -217,10 +217,8 @@ learning_config: eval_dataset_config: use_tf: True - data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + data_paths: null + tfrecords_dir: null shuffle: False cache: True buffer_size: 100 @@ -230,8 +228,8 @@ learning_config: test_dataset_config: use_tf: True data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + - /mnt/e/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv + tfrecords_dir: null shuffle: False cache: True buffer_size: 100 @@ -240,26 +238,21 @@ learning_config: optimizer_config: warmup_steps: 40000 - beta1: 0.9 - beta2: 0.98 + beta_1: 0.9 + beta_2: 0.98 epsilon: 
1e-9 running_config: batch_size: 2 - accumulation_steps: 4 num_epochs: 20 - outdir: /mnt/Miscellanea/Models/local/contextnet - log_interval_steps: 300 - eval_interval_steps: 500 - save_interval_steps: 1000 checkpoint: - filepath: /mnt/Miscellanea/Models/local/contextnet/checkpoints/{epoch:02d}.h5 + filepath: /mnt/e/Models/local/contextnet/checkpoints/{epoch:02d}.h5 save_best_only: True save_weights_only: False save_freq: epoch - states_dir: /mnt/Miscellanea/Models/local/contextnet/states + states_dir: /mnt/e/Models/local/contextnet/states tensorboard: - log_dir: /mnt/Miscellanea/Models/local/contextnet/tensorboard + log_dir: /mnt/e/Models/local/contextnet/tensorboard histogram_freq: 1 write_graph: True write_images: True diff --git a/notebooks/conformer.ipynb b/notebooks/conformer.ipynb new file mode 100644 index 0000000000..e69de29bb2 diff --git a/notebooks/contextnet.ipynb b/notebooks/contextnet.ipynb new file mode 100644 index 0000000000..e69de29bb2 diff --git a/notebooks/deepspeech2.ipynb b/notebooks/deepspeech2.ipynb new file mode 100644 index 0000000000..e69de29bb2 diff --git a/notebooks/jasper.ipynb b/notebooks/jasper.ipynb new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py index 1ebf8787e5..39ddbe4693 100644 --- a/tensorflow_asr/models/base_model.py +++ b/tensorflow_asr/models/base_model.py @@ -95,10 +95,12 @@ def train_step(self, batch): y_pred = self(inputs, training=True) loss = self.loss(y_true, y_pred) if self.use_loss_scale: - loss = self.optimizer.get_scaled_loss(loss) - gradients = tape.gradient(loss, self.trainable_weights) + scaled_loss = self.optimizer.get_scaled_loss(loss) if self.use_loss_scale: + gradients = tape.gradient(scaled_loss, self.trainable_weights) gradients = self.optimizer.get_unscaled_gradients(gradients) + else: + gradients = tape.gradient(loss, self.trainable_weights) self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) 
self._metrics["loss"].update_state(loss) return {m.name: m.result() for m in self.metrics} @@ -127,6 +129,8 @@ def predict_step(self, batch): beam_search_decoding = self.recognize_beam(inputs) return tf.stack([labels, greedy_decoding, beam_search_decoding], axis=-1) + # -------------------------------- INFERENCE FUNCTIONS ------------------------------------- + def recognize(self, features, input_lengths, **kwargs): pass diff --git a/tensorflow_asr/models/ctc/ctc.py b/tensorflow_asr/models/ctc/ctc.py index a30c0e166e..0166e3ead7 100644 --- a/tensorflow_asr/models/ctc/ctc.py +++ b/tensorflow_asr/models/ctc/ctc.py @@ -38,10 +38,6 @@ def __init__(self, self.decoder = decoder self.time_reduction_factor = 1 - @property - def metrics(self): - return [self.loss_metric] - def _build(self, input_shape, batch_size=None): inputs = tf.keras.Input(input_shape, batch_size=batch_size, dtype=tf.float32) inputs_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32) diff --git a/tensorflow_asr/models/transducer/contextnet.py b/tensorflow_asr/models/transducer/contextnet.py index bc81f17fe7..134a06a8f2 100644 --- a/tensorflow_asr/models/transducer/contextnet.py +++ b/tensorflow_asr/models/transducer/contextnet.py @@ -17,7 +17,7 @@ from ..encoders.contextnet import ContextNetEncoder, L2 from .transducer import Transducer -from ...utils import math_util +from ...utils import math_util, data_util class ContextNet(Transducer): @@ -80,11 +80,13 @@ def __init__(self, for block in self.encoder.blocks: self.time_reduction_factor *= block.time_reduction_factor def call(self, inputs, training=False, **kwargs): - features, input_length, prediction, prediction_length = inputs - enc = self.encoder([features, input_length], training=training, **kwargs) - pred = self.predict_net([prediction, prediction_length], training=training, **kwargs) - outputs = self.joint_net([enc, pred], training=training, **kwargs) - return outputs + enc = self.encoder([inputs["inputs"], 
inputs["inputs_length"]], training=training, **kwargs) + pred = self.predict_net([inputs["predictions"], inputs["predictions_length"]], training=training, **kwargs) + logits = self.joint_net([enc, pred], training=training, **kwargs) + return data_util.create_logits( + logits=logits, + logits_length=math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor) + ) def encoder_inference(self, features: tf.Tensor, input_length: tf.Tensor): with tf.name_scope(f"{self.name}_encoder"): From 4be4a6e062c744dffb3fd392d5c96a6003d6a031 Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Sat, 17 Apr 2021 18:07:08 +0700 Subject: [PATCH 12/13] :writing_hand: test and update train script --- examples/conformer/config.yml | 2 +- examples/contextnet/config.yml | 4 ++-- examples/deepspeech2/config.yml | 27 ++++++++++----------------- examples/jasper/config.yml | 27 ++++++++++----------------- examples/rnn_transducer/config.yml | 27 ++++++++++----------------- examples/rnn_transducer/train.py | 14 +------------- tensorflow_asr/models/ctc/jasper.py | 2 +- tensorflow_asr/utils/data_util.py | 9 ++++++--- tensorflow_asr/utils/env_util.py | 2 ++ 9 files changed, 43 insertions(+), 71 deletions(-) diff --git a/examples/conformer/config.yml b/examples/conformer/config.yml index 0ee6487e98..9ab5255c1f 100755 --- a/examples/conformer/config.yml +++ b/examples/conformer/config.yml @@ -115,7 +115,7 @@ learning_config: checkpoint: filepath: /mnt/e/Models/local/conformer/checkpoints/{epoch:02d}.h5 save_best_only: True - save_weights_only: False + save_weights_only: True save_freq: epoch states_dir: /mnt/e/Models/local/conformer/states tensorboard: diff --git a/examples/contextnet/config.yml b/examples/contextnet/config.yml index 790c0e5a38..c0b9f24dd1 100644 --- a/examples/contextnet/config.yml +++ b/examples/contextnet/config.yml @@ -228,7 +228,7 @@ learning_config: test_dataset_config: use_tf: True data_paths: - - /mnt/e/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv + 
- /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv tfrecords_dir: null shuffle: False cache: True @@ -248,7 +248,7 @@ learning_config: checkpoint: filepath: /mnt/e/Models/local/contextnet/checkpoints/{epoch:02d}.h5 save_best_only: True - save_weights_only: False + save_weights_only: True save_freq: epoch states_dir: /mnt/e/Models/local/contextnet/states tensorboard: diff --git a/examples/deepspeech2/config.yml b/examples/deepspeech2/config.yml index 68a77d7bd4..cbc8ad65ef 100755 --- a/examples/deepspeech2/config.yml +++ b/examples/deepspeech2/config.yml @@ -52,8 +52,8 @@ learning_config: train_dataset_config: use_tf: True data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv + tfrecords_dir: null shuffle: True cache: True buffer_size: 100 @@ -62,10 +62,8 @@ learning_config: eval_dataset_config: use_tf: True - data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + data_paths: null + tfrecords_dir: null shuffle: False cache: True buffer_size: 100 @@ -75,8 +73,8 @@ learning_config: test_dataset_config: use_tf: True data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv + tfrecords_dir: null shuffle: False cache: True buffer_size: 100 @@ -91,19 +89,14 @@ learning_config: running_config: batch_size: 4 num_epochs: 20 - accumulation_steps: 8 - outdir: /mnt/Miscellanea/Models/local/deepspeech2 - log_interval_steps: 400 - save_interval_steps: 400 - eval_interval_steps: 800 checkpoint: - filepath: 
/mnt/Miscellanea/Models/local/deepspeech2/checkpoints/{epoch:02d}.h5 + filepath: /mnt/e/Models/local/deepspeech2/checkpoints/{epoch:02d}.h5 save_best_only: True - save_weights_only: False + save_weights_only: True save_freq: epoch - states_dir: /mnt/Miscellanea/Models/local/deepspeech2/states + states_dir: /mnt/e/Models/local/deepspeech2/states tensorboard: - log_dir: /mnt/Miscellanea/Models/local/deepspeech2/tensorboard + log_dir: /mnt/e/Models/local/deepspeech2/tensorboard histogram_freq: 1 write_graph: True write_images: True diff --git a/examples/jasper/config.yml b/examples/jasper/config.yml index f6c158edce..0b16fdec89 100755 --- a/examples/jasper/config.yml +++ b/examples/jasper/config.yml @@ -59,8 +59,8 @@ learning_config: train_dataset_config: use_tf: True data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv + tfrecords_dir: null shuffle: True cache: True buffer_size: 100 @@ -69,10 +69,8 @@ learning_config: eval_dataset_config: use_tf: True - data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + data_paths: null + tfrecords_dir: null shuffle: False cache: True buffer_size: 100 @@ -82,8 +80,8 @@ learning_config: test_dataset_config: use_tf: True data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv + tfrecords_dir: null shuffle: False cache: True buffer_size: 100 @@ -98,19 +96,14 @@ learning_config: running_config: batch_size: 4 num_epochs: 20 - accumulation_steps: 8 - outdir: /mnt/Miscellanea/Models/local/jasper - 
log_interval_steps: 400 - save_interval_steps: 400 - eval_interval_steps: 800 checkpoint: - filepath: /mnt/Miscellanea/Models/local/jasper/checkpoints/{epoch:02d}.h5 + filepath: /mnt/e/Models/local/jasper/checkpoints/{epoch:02d}.h5 save_best_only: True - save_weights_only: False + save_weights_only: True save_freq: epoch - states_dir: /mnt/Miscellanea/Models/local/jasper/states + states_dir: /mnt/e/Models/local/jasper/states tensorboard: - log_dir: /mnt/Miscellanea/Models/local/jasper/tensorboard + log_dir: /mnt/e/Models/local/jasper/tensorboard histogram_freq: 1 write_graph: True write_images: True diff --git a/examples/rnn_transducer/config.yml b/examples/rnn_transducer/config.yml index 8acfee4f92..4efbb11024 100644 --- a/examples/rnn_transducer/config.yml +++ b/examples/rnn_transducer/config.yml @@ -64,8 +64,8 @@ learning_config: num_masks: 1 mask_factor: 27 data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv + tfrecords_dir: null shuffle: True cache: True buffer_size: 100 @@ -74,10 +74,8 @@ learning_config: eval_dataset_config: use_tf: True - data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + data_paths: null + tfrecords_dir: null shuffle: False cache: True buffer_size: 100 @@ -87,8 +85,8 @@ learning_config: test_dataset_config: use_tf: True data_paths: - - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv - tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords + - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv + tfrecords_dir: null shuffle: False cache: True buffer_size: 100 @@ -102,20 +100,15 @@ learning_config: running_config: 
batch_size: 2 - accumulation_steps: 1 num_epochs: 20 - outdir: /mnt/Miscellanea/Models/local/streaming_transducer - log_interval_steps: 300 - eval_interval_steps: 500 - save_interval_steps: 1000 checkpoint: - filepath: /mnt/Miscellanea/Models/local/streaming_transducer/checkpoints/{epoch:02d}.h5 + filepath: /mnt/e/Models/local/rnn_transducer/checkpoints/{epoch:02d}.h5 save_best_only: True - save_weights_only: False + save_weights_only: True save_freq: epoch - states_dir: /mnt/Miscellanea/Models/local/streaming_transducer/states + states_dir: /mnt/e/Models/local/rnn_transducer/states tensorboard: - log_dir: /mnt/Miscellanea/Models/local/streaming_transducer/tensorboard + log_dir: /mnt/e/Models/local/rnn_transducer/tensorboard histogram_freq: 1 write_graph: True write_images: True diff --git a/examples/rnn_transducer/train.py b/examples/rnn_transducer/train.py index a35f7f2801..4e3eff4ba4 100644 --- a/examples/rnn_transducer/train.py +++ b/examples/rnn_transducer/train.py @@ -13,7 +13,6 @@ # limitations under the License. 
import os -import math import argparse from tensorflow_asr.utils import env_util @@ -58,7 +57,6 @@ from tensorflow_asr.datasets import asr_dataset from tensorflow_asr.featurizers import speech_featurizers, text_featurizers from tensorflow_asr.models.transducer.rnn_transducer import RnnTransducer -from tensorflow_asr.optimizers.schedules import TransformerSchedule config = Config(args.config) speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config) @@ -118,18 +116,8 @@ rnn_transducer = RnnTransducer(**config.model_config, vocabulary_size=text_featurizer.num_classes) rnn_transducer._build(speech_featurizer.shape) rnn_transducer.summary(line_length=100) - - optimizer = tf.keras.optimizers.Adam( - TransformerSchedule( - d_model=rnn_transducer.dmodel, - warmup_steps=config.learning_config.optimizer_config.pop("warmup_steps", 10000), - max_lr=(0.05 / math.sqrt(rnn_transducer.dmodel)) - ), - **config.learning_config.optimizer_config - ) - rnn_transducer.compile( - optimizer=optimizer, + optimizer=config.learning_config.optimizer_config, experimental_steps_per_execution=args.spx, global_batch_size=global_batch_size, blank=text_featurizer.blank diff --git a/tensorflow_asr/models/ctc/jasper.py b/tensorflow_asr/models/ctc/jasper.py index 963391a7bb..23b47ed063 100644 --- a/tensorflow_asr/models/ctc/jasper.py +++ b/tensorflow_asr/models/ctc/jasper.py @@ -357,7 +357,7 @@ def __init__(self, strides=1, padding="same", kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer, - name=f"{self.name}_logits" + name=f"{name}_logits" ), vocabulary_size=vocabulary_size, name=name, diff --git a/tensorflow_asr/utils/data_util.py b/tensorflow_asr/utils/data_util.py index 324c720d49..2bcdca8d4e 100644 --- a/tensorflow_asr/utils/data_util.py +++ b/tensorflow_asr/utils/data_util.py @@ -21,12 +21,15 @@ def create_inputs(inputs: tf.Tensor, inputs_length: tf.Tensor, predictions: tf.Tensor = None, predictions_length: tf.Tensor = None) -> dict: - return { + 
data = { "inputs": inputs, "inputs_length": inputs_length, - "predictions": predictions, - "predictions_length": predictions_length } + if predictions is not None: + data["predictions"] = predictions + if predictions_length is not None: + data["predictions_length"] = predictions_length + return data def create_logits(logits: tf.Tensor, logits_length: tf.Tensor) -> dict: diff --git a/tensorflow_asr/utils/env_util.py b/tensorflow_asr/utils/env_util.py index c5564b543e..8073b20eee 100644 --- a/tensorflow_asr/utils/env_util.py +++ b/tensorflow_asr/utils/env_util.py @@ -49,6 +49,8 @@ def setup_strategy(devices): tf.distribute.Strategy: MirroredStrategy for training one or multiple gpus """ setup_devices(devices) + if has_tpu(): + return setup_tpu() return tf.distribute.MirroredStrategy() From 11d6afcb1ec42caaa5b39451b8b724514dd11837 Mon Sep 17 00:00:00 2001 From: Huy Le Nguyen Date: Sun, 18 Apr 2021 00:34:02 +0700 Subject: [PATCH 13/13] :writing_hand: update dataset and add notebooks --- notebooks/conformer.ipynb | 269 +++++++++++++++ notebooks/contextnet.ipynb | 433 +++++++++++++++++++++++++ notebooks/deepspeech2.ipynb | 0 notebooks/jasper.ipynb | 0 notebooks/rnn_transducer.ipynb | 237 ++++++++++++++ scripts/create_vocab_from_trans.py | 6 +- scripts/generate_metadata.py | 4 +- tensorflow_asr/configs/config.py | 5 +- tensorflow_asr/datasets/asr_dataset.py | 36 +- 9 files changed, 966 insertions(+), 24 deletions(-) delete mode 100644 notebooks/deepspeech2.ipynb delete mode 100644 notebooks/jasper.ipynb create mode 100644 notebooks/rnn_transducer.ipynb diff --git a/notebooks/conformer.ipynb b/notebooks/conformer.ipynb index e69de29bb2..911da8606f 100644 --- a/notebooks/conformer.ipynb +++ b/notebooks/conformer.ipynb @@ -0,0 +1,269 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": 
"ipython3", + "version": "3.8.8-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python388jvsc74a57bd045f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f", + "display_name": "Python 3.8.8 64-bit ('tfo': conda)" + }, + "metadata": { + "interpreter": { + "hash": "45f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config = {\n", + " \"speech_config\": {\n", + " \"sample_rate\": 16000,\n", + " \"frame_ms\": 25,\n", + " \"stride_ms\": 10,\n", + " \"num_feature_bins\": 80,\n", + " \"feature_type\": \"log_mel_spectrogram\",\n", + " \"preemphasis\": 0.97,\n", + " \"normalize_signal\": True,\n", + " \"normalize_feature\": True,\n", + " \"normalize_per_feature\": False,\n", + " },\n", + " \"decoder_config\": {\n", + " \"vocabulary\": None,\n", + " \"target_vocab_size\": 1000,\n", + " \"max_subword_length\": 10,\n", + " \"blank_at_zero\": True,\n", + " \"beam_width\": 0,\n", + " \"norm_score\": True,\n", + " \"corpus_files\": None,\n", + " },\n", + " \"model_config\": {\n", + " \"name\": \"conformer\",\n", + " \"encoder_subsampling\": {\n", + " \"type\": \"conv2d\",\n", + " \"filters\": 144,\n", + " \"kernel_size\": 3,\n", + " \"strides\": 2,\n", + " },\n", + " \"encoder_positional_encoding\": \"sinusoid_concat\",\n", + " \"encoder_dmodel\": 144,\n", + " \"encoder_num_blocks\": 16,\n", + " \"encoder_head_size\": 36,\n", + " \"encoder_num_heads\": 4,\n", + " \"encoder_mha_type\": \"relmha\",\n", + " \"encoder_kernel_size\": 32,\n", + " \"encoder_fc_factor\": 0.5,\n", + " \"encoder_dropout\": 0.1,\n", + " \"prediction_embed_dim\": 320,\n", + " \"prediction_embed_dropout\": 0,\n", + " \"prediction_num_rnns\": 1,\n", + " \"prediction_rnn_units\": 320,\n", + " \"prediction_rnn_type\": \"lstm\",\n", + " \"prediction_rnn_implementation\": 2,\n", + " 
\"prediction_layer_norm\": True,\n", + " \"prediction_projection_units\": 0,\n", + " \"joint_dim\": 320,\n", + " \"prejoint_linear\": True,\n", + " \"joint_activation\": \"tanh\",\n", + " \"joint_mode\": \"add\",\n", + " },\n", + " \"learning_config\": {\n", + " \"train_dataset_config\": {\n", + " \"use_tf\": True,\n", + " \"augmentation_config\": {\n", + " \"feature_augment\": {\n", + " \"time_masking\": {\n", + " \"num_masks\": 10,\n", + " \"mask_factor\": 100,\n", + " \"p_upperbound\": 0.05,\n", + " },\n", + " \"freq_masking\": {\"num_masks\": 1, \"mask_factor\": 27},\n", + " }\n", + " },\n", + " \"data_paths\": [\n", + " \"/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv\"\n", + " ],\n", + " \"tfrecords_dir\": None,\n", + " \"shuffle\": True,\n", + " \"cache\": True,\n", + " \"buffer_size\": 100,\n", + " \"drop_remainder\": True,\n", + " \"stage\": \"train\",\n", + " },\n", + " \"eval_dataset_config\": {\n", + " \"use_tf\": True,\n", + " \"data_paths\": None,\n", + " \"tfrecords_dir\": None,\n", + " \"shuffle\": False,\n", + " \"cache\": True,\n", + " \"buffer_size\": 100,\n", + " \"drop_remainder\": True,\n", + " \"stage\": \"eval\",\n", + " },\n", + " \"test_dataset_config\": {\n", + " \"use_tf\": True,\n", + " \"data_paths\": None,\n", + " \"tfrecords_dir\": None,\n", + " \"shuffle\": False,\n", + " \"cache\": True,\n", + " \"buffer_size\": 100,\n", + " \"drop_remainder\": True,\n", + " \"stage\": \"test\",\n", + " },\n", + " \"optimizer_config\": {\n", + " \"warmup_steps\": 40000,\n", + " \"beta_1\": 0.9,\n", + " \"beta_2\": 0.98,\n", + " \"epsilon\": 1e-09,\n", + " },\n", + " \"running_config\": {\n", + " \"batch_size\": 2,\n", + " \"num_epochs\": 50,\n", + " \"checkpoint\": {\n", + " \"filepath\": \"/mnt/e/Models/local/conformer/checkpoints/{epoch:02d}.h5\",\n", + " \"save_best_only\": True,\n", + " \"save_weights_only\": True,\n", + " \"save_freq\": \"epoch\",\n", + " },\n", + " \"states_dir\": 
\"/mnt/e/Models/local/conformer/states\",\n", + " \"tensorboard\": {\n", + " \"log_dir\": \"/mnt/e/Models/local/conformer/tensorboard\",\n", + " \"histogram_freq\": 1,\n", + " \"write_graph\": True,\n", + " \"write_images\": True,\n", + " \"update_freq\": \"epoch\",\n", + " \"profile_batch\": 2,\n", + " },\n", + " },\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metadata = {\n", + " \"train\": {\"max_input_length\": 2974, \"max_label_length\": 194, \"num_entries\": 281241},\n", + " \"eval\": {\"max_input_length\": 3516, \"max_label_length\": 186, \"num_entries\": 5567},\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import math\n", + "import argparse\n", + "from tensorflow_asr.utils import env_util\n", + "\n", + "env_util.setup_environment()\n", + "import tensorflow as tf\n", + "\n", + "tf.keras.backend.clear_session()\n", + "tf.config.optimizer.set_experimental_options({\"auto_mixed_precision\": True})\n", + "strategy = env_util.setup_strategy([0])\n", + "\n", + "from tensorflow_asr.configs.config import Config\n", + "from tensorflow_asr.datasets import asr_dataset\n", + "from tensorflow_asr.featurizers import speech_featurizers, text_featurizers\n", + "from tensorflow_asr.models.transducer.conformer import Conformer\n", + "from tensorflow_asr.optimizers.schedules import TransformerSchedule\n", + "\n", + "config = Config(config)\n", + "speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)\n", + "\n", + "text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)\n", + "\n", + "train_dataset = asr_dataset.ASRSliceDataset(\n", + " speech_featurizer=speech_featurizer,\n", + " text_featurizer=text_featurizer,\n", + " **vars(config.learning_config.train_dataset_config),\n", + " indefinite=True\n", + ")\n", + "eval_dataset = 
asr_dataset.ASRSliceDataset(\n", + " speech_featurizer=speech_featurizer,\n", + " text_featurizer=text_featurizer,\n", + " **vars(config.learning_config.eval_dataset_config),\n", + " indefinite=True\n", + ")\n", + "\n", + "train_dataset.load_metadata(metadata)\n", + "eval_dataset.load_metadata(metadata)\n", + "speech_featurizer.reset_length()\n", + "text_featurizer.reset_length()\n", + "\n", + "global_batch_size = config.learning_config.running_config.batch_size\n", + "global_batch_size *= strategy.num_replicas_in_sync\n", + "\n", + "train_data_loader = train_dataset.create(global_batch_size)\n", + "eval_data_loader = eval_dataset.create(global_batch_size)\n", + "\n", + "with strategy.scope():\n", + " # build model\n", + " conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)\n", + " conformer._build(speech_featurizer.shape)\n", + " conformer.summary(line_length=100)\n", + "\n", + " optimizer = tf.keras.optimizers.Adam(\n", + " TransformerSchedule(\n", + " d_model=conformer.dmodel,\n", + " warmup_steps=config.learning_config.optimizer_config.pop(\"warmup_steps\", 10000),\n", + " max_lr=(0.05 / math.sqrt(conformer.dmodel))\n", + " ),\n", + " **config.learning_config.optimizer_config\n", + " )\n", + "\n", + " conformer.compile(\n", + " optimizer=optimizer,\n", + " experimental_steps_per_execution=10,\n", + " global_batch_size=global_batch_size,\n", + " blank=text_featurizer.blank\n", + " )\n", + "\n", + "callbacks = [\n", + " tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint),\n", + " tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir),\n", + " tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard)\n", + "]\n", + "\n", + "conformer.fit(\n", + " train_data_loader,\n", + " epochs=config.learning_config.running_config.num_epochs,\n", + " validation_data=eval_data_loader,\n", + " callbacks=callbacks,\n", + " 
steps_per_epoch=train_dataset.total_steps,\n", + " validation_steps=eval_dataset.total_steps\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/contextnet.ipynb b/notebooks/contextnet.ipynb index e69de29bb2..22efd1ca29 100644 --- a/notebooks/contextnet.ipynb +++ b/notebooks/contextnet.ipynb @@ -0,0 +1,433 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python388jvsc74a57bd045f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f", + "display_name": "Python 3.8.8 64-bit ('tfo': conda)" + }, + "metadata": { + "interpreter": { + "hash": "45f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config = {\n", + " \"speech_config\": {\n", + " \"sample_rate\": 16000,\n", + " \"frame_ms\": 25,\n", + " \"stride_ms\": 10,\n", + " \"num_feature_bins\": 80,\n", + " \"feature_type\": \"log_mel_spectrogram\",\n", + " \"preemphasis\": 0.97,\n", + " \"normalize_signal\": True,\n", + " \"normalize_feature\": True,\n", + " \"normalize_per_feature\": False,\n", + " },\n", + " \"decoder_config\": {\n", + " \"vocabulary\": None,\n", + " \"target_vocab_size\": 1024,\n", + " \"max_subword_length\": 4,\n", + " \"blank_at_zero\": True,\n", + " \"beam_width\": 5,\n", + " \"norm_score\": True,\n", + " },\n", + " \"model_config\": {\n", + " \"name\": \"contextnet\",\n", + " \"encoder_alpha\": 0.5,\n", + " \"encoder_blocks\": [\n", + " {\n", + " \"nlayers\": 1,\n", + " \"kernel_size\": 5,\n", + 
" \"filters\": 256,\n", + " \"strides\": 1,\n", + " \"residual\": False,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 256,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 256,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 256,\n", + " \"strides\": 2,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 256,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 256,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 256,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 256,\n", + " \"strides\": 2,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 256,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 256,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 256,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 
5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 512,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 512,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 512,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 512,\n", + " \"strides\": 2,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 512,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 512,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 512,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 512,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 512,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 512,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + " },\n", + " {\n", + " \"nlayers\": 5,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 512,\n", + " \"strides\": 1,\n", + " \"residual\": True,\n", + " \"activation\": \"silu\",\n", + 
" },\n", + " {\n", + " \"nlayers\": 1,\n", + " \"kernel_size\": 5,\n", + " \"filters\": 640,\n", + " \"strides\": 1,\n", + " \"residual\": False,\n", + " \"activation\": \"silu\",\n", + " },\n", + " ],\n", + " \"prediction_embed_dim\": 640,\n", + " \"prediction_embed_dropout\": 0,\n", + " \"prediction_num_rnns\": 1,\n", + " \"prediction_rnn_units\": 640,\n", + " \"prediction_rnn_type\": \"lstm\",\n", + " \"prediction_rnn_implementation\": 1,\n", + " \"prediction_layer_norm\": True,\n", + " \"prediction_projection_units\": 0,\n", + " \"joint_dim\": 640,\n", + " \"joint_activation\": \"tanh\",\n", + " },\n", + " \"learning_config\": {\n", + " \"train_dataset_config\": {\n", + " \"use_tf\": True,\n", + " \"augmentation_config\": {\n", + " \"feature_augment\": {\n", + " \"time_masking\": {\n", + " \"num_masks\": 10,\n", + " \"mask_factor\": 100,\n", + " \"p_upperbound\": 0.05,\n", + " },\n", + " \"freq_masking\": {\"num_masks\": 1, \"mask_factor\": 27},\n", + " }\n", + " },\n", + " \"data_paths\": [\n", + " \"/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv\"\n", + " ],\n", + " \"tfrecords_dir\": None,\n", + " \"shuffle\": True,\n", + " \"cache\": True,\n", + " \"buffer_size\": 100,\n", + " \"drop_remainder\": True,\n", + " \"stage\": \"train\",\n", + " },\n", + " \"eval_dataset_config\": {\n", + " \"use_tf\": True,\n", + " \"data_paths\": None,\n", + " \"tfrecords_dir\": None,\n", + " \"shuffle\": False,\n", + " \"cache\": True,\n", + " \"buffer_size\": 100,\n", + " \"drop_remainder\": True,\n", + " \"stage\": \"eval\",\n", + " },\n", + " \"test_dataset_config\": {\n", + " \"use_tf\": True,\n", + " \"data_paths\": [\n", + " \"/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv\"\n", + " ],\n", + " \"tfrecords_dir\": None,\n", + " \"shuffle\": False,\n", + " \"cache\": True,\n", + " \"buffer_size\": 100,\n", + " \"drop_remainder\": True,\n", + " \"stage\": \"test\",\n", + " },\n", + " \"optimizer_config\": {\n", + " \"warmup_steps\": 
40000,\n", + " \"beta_1\": 0.9,\n", + " \"beta_2\": 0.98,\n", + " \"epsilon\": 1e-09,\n", + " },\n", + " \"running_config\": {\n", + " \"batch_size\": 2,\n", + " \"num_epochs\": 20,\n", + " \"checkpoint\": {\n", + " \"filepath\": \"/mnt/e/Models/local/contextnet/checkpoints/{epoch:02d}.h5\",\n", + " \"save_best_only\": True,\n", + " \"save_weights_only\": True,\n", + " \"save_freq\": \"epoch\",\n", + " },\n", + " \"states_dir\": \"/mnt/e/Models/local/contextnet/states\",\n", + " \"tensorboard\": {\n", + " \"log_dir\": \"/mnt/e/Models/local/contextnet/tensorboard\",\n", + " \"histogram_freq\": 1,\n", + " \"write_graph\": True,\n", + " \"write_images\": True,\n", + " \"update_freq\": \"epoch\",\n", + " \"profile_batch\": 2,\n", + " },\n", + " },\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metadata = {\n", + " \"train\": {\"max_input_length\": 2974, \"max_label_length\": 194, \"num_entries\": 281241},\n", + " \"eval\": {\"max_input_length\": 3516, \"max_label_length\": 186, \"num_entries\": 5567},\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import math\n", + "import argparse\n", + "from tensorflow_asr.utils import env_util\n", + "\n", + "env_util.setup_environment()\n", + "import tensorflow as tf\n", + "\n", + "tf.keras.backend.clear_session()\n", + "tf.config.optimizer.set_experimental_options({\"auto_mixed_precision\": True})\n", + "strategy = env_util.setup_strategy([0])\n", + "\n", + "from tensorflow_asr.configs.config import Config\n", + "from tensorflow_asr.datasets import asr_dataset\n", + "from tensorflow_asr.featurizers import speech_featurizers, text_featurizers\n", + "from tensorflow_asr.models.transducer.contextnet import ContextNet\n", + "from tensorflow_asr.optimizers.schedules import TransformerSchedule\n", + "\n", + "config = Config(config)\n", + "speech_featurizer 
= speech_featurizers.TFSpeechFeaturizer(config.speech_config)\n", + "\n", + "text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)\n", + "\n", + "train_dataset = asr_dataset.ASRSliceDataset(\n", + " speech_featurizer=speech_featurizer,\n", + " text_featurizer=text_featurizer,\n", + " **vars(config.learning_config.train_dataset_config),\n", + " indefinite=True\n", + ")\n", + "eval_dataset = asr_dataset.ASRSliceDataset(\n", + " speech_featurizer=speech_featurizer,\n", + " text_featurizer=text_featurizer,\n", + " **vars(config.learning_config.eval_dataset_config),\n", + " indefinite=True\n", + ")\n", + "\n", + "train_dataset.load_metadata(metadata)\n", + "eval_dataset.load_metadata(metadata)\n", + "speech_featurizer.reset_length()\n", + "text_featurizer.reset_length()\n", + "\n", + "global_batch_size = config.learning_config.running_config.batch_size\n", + "global_batch_size *= strategy.num_replicas_in_sync\n", + "\n", + "train_data_loader = train_dataset.create(global_batch_size)\n", + "eval_data_loader = eval_dataset.create(global_batch_size)\n", + "\n", + "with strategy.scope():\n", + " # build model\n", + " contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)\n", + " contextnet._build(speech_featurizer.shape)\n", + " contextnet.summary(line_length=100)\n", + "\n", + " optimizer = tf.keras.optimizers.Adam(\n", + " TransformerSchedule(\n", + " d_model=contextnet.dmodel,\n", + " warmup_steps=config.learning_config.optimizer_config.pop(\"warmup_steps\", 10000),\n", + " max_lr=(0.05 / math.sqrt(contextnet.dmodel))\n", + " ),\n", + " **config.learning_config.optimizer_config\n", + " )\n", + "\n", + " contextnet.compile(\n", + " optimizer=optimizer,\n", + " experimental_steps_per_execution=10,\n", + " global_batch_size=global_batch_size,\n", + " blank=text_featurizer.blank\n", + " )\n", + "\n", + "callbacks = [\n", + " tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint),\n", + " 
tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir),\n", + " tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard)\n", + "]\n", + "\n", + "contextnet.fit(\n", + " train_data_loader,\n", + " epochs=config.learning_config.running_config.num_epochs,\n", + " validation_data=eval_data_loader,\n", + " callbacks=callbacks,\n", + " steps_per_epoch=train_dataset.total_steps,\n", + " validation_steps=eval_dataset.total_steps\n", + ")" + ] + } + ] +} \ No newline at end of file diff --git a/notebooks/deepspeech2.ipynb b/notebooks/deepspeech2.ipynb deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/notebooks/jasper.ipynb b/notebooks/jasper.ipynb deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/notebooks/rnn_transducer.ipynb b/notebooks/rnn_transducer.ipynb new file mode 100644 index 0000000000..efa97dc3fd --- /dev/null +++ b/notebooks/rnn_transducer.ipynb @@ -0,0 +1,237 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python388jvsc74a57bd045f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f", + "display_name": "Python 3.8.8 64-bit ('tfo': conda)" + }, + "metadata": { + "interpreter": { + "hash": "45f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config = {\n", + " \"speech_config\": {\n", + " \"sample_rate\": 16000,\n", + " \"frame_ms\": 25,\n", + " \"stride_ms\": 10,\n", + " \"num_feature_bins\": 80,\n", + " \"feature_type\": \"log_mel_spectrogram\",\n", + " \"preemphasis\": 0.97,\n", + " 
\"normalize_signal\": True,\n", + " \"normalize_feature\": True,\n", + " \"normalize_per_feature\": False,\n", + " },\n", + " \"decoder_config\": {\n", + " \"vocabulary\": None,\n", + " \"target_vocab_size\": 1024,\n", + " \"max_subword_length\": 4,\n", + " \"blank_at_zero\": True,\n", + " \"beam_width\": 5,\n", + " \"norm_score\": True,\n", + " },\n", + " \"model_config\": {\n", + " \"name\": \"streaming_transducer\",\n", + " \"encoder_reductions\": {0: 3, 1: 2},\n", + " \"encoder_dmodel\": 320,\n", + " \"encoder_rnn_type\": \"lstm\",\n", + " \"encoder_rnn_units\": 1024,\n", + " \"encoder_nlayers\": 8,\n", + " \"encoder_layer_norm\": True,\n", + " \"prediction_embed_dim\": 320,\n", + " \"prediction_embed_dropout\": 0.0,\n", + " \"prediction_num_rnns\": 2,\n", + " \"prediction_rnn_units\": 1024,\n", + " \"prediction_rnn_type\": \"lstm\",\n", + " \"prediction_projection_units\": 320,\n", + " \"prediction_layer_norm\": True,\n", + " \"joint_dim\": 320,\n", + " \"joint_activation\": \"tanh\",\n", + " },\n", + " \"learning_config\": {\n", + " \"train_dataset_config\": {\n", + " \"use_tf\": True,\n", + " \"augmentation_config\": {\n", + " \"feature_augment\": {\n", + " \"time_masking\": {\n", + " \"num_masks\": 10,\n", + " \"mask_factor\": 100,\n", + " \"p_upperbound\": 0.05,\n", + " },\n", + " \"freq_masking\": {\"num_masks\": 1, \"mask_factor\": 27},\n", + " }\n", + " },\n", + " \"data_paths\": [\n", + " \"/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv\"\n", + " ],\n", + " \"tfrecords_dir\": None,\n", + " \"shuffle\": True,\n", + " \"cache\": True,\n", + " \"buffer_size\": 100,\n", + " \"drop_remainder\": True,\n", + " \"stage\": \"train\",\n", + " },\n", + " \"eval_dataset_config\": {\n", + " \"use_tf\": True,\n", + " \"data_paths\": None,\n", + " \"tfrecords_dir\": None,\n", + " \"shuffle\": False,\n", + " \"cache\": True,\n", + " \"buffer_size\": 100,\n", + " \"drop_remainder\": True,\n", + " \"stage\": \"eval\",\n", + " },\n", + " 
\"test_dataset_config\": {\n", + " \"use_tf\": True,\n", + " \"data_paths\": [\n", + " \"/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv\"\n", + " ],\n", + " \"tfrecords_dir\": None,\n", + " \"shuffle\": False,\n", + " \"cache\": True,\n", + " \"buffer_size\": 100,\n", + " \"drop_remainder\": True,\n", + " \"stage\": \"test\",\n", + " },\n", + " \"optimizer_config\": {\"class_name\": \"adam\", \"config\": {\"learning_rate\": 0.0001}},\n", + " \"running_config\": {\n", + " \"batch_size\": 2,\n", + " \"num_epochs\": 20,\n", + " \"checkpoint\": {\n", + " \"filepath\": \"/mnt/e/Models/local/rnn_transducer/checkpoints/{epoch:02d}.h5\",\n", + " \"save_best_only\": True,\n", + " \"save_weights_only\": True,\n", + " \"save_freq\": \"epoch\",\n", + " },\n", + " \"states_dir\": \"/mnt/e/Models/local/rnn_transducer/states\",\n", + " \"tensorboard\": {\n", + " \"log_dir\": \"/mnt/e/Models/local/rnn_transducer/tensorboard\",\n", + " \"histogram_freq\": 1,\n", + " \"write_graph\": True,\n", + " \"write_images\": True,\n", + " \"update_freq\": \"epoch\",\n", + " \"profile_batch\": 2,\n", + " },\n", + " },\n", + " },\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metadata = {\n", + " \"train\": {\"max_input_length\": 2974, \"max_label_length\": 194, \"num_entries\": 281241},\n", + " \"eval\": {\"max_input_length\": 3516, \"max_label_length\": 186, \"num_entries\": 5567},\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import math\n", + "import argparse\n", + "from tensorflow_asr.utils import env_util\n", + "\n", + "env_util.setup_environment()\n", + "import tensorflow as tf\n", + "\n", + "tf.keras.backend.clear_session()\n", + "tf.config.optimizer.set_experimental_options({\"auto_mixed_precision\": True})\n", + "strategy = env_util.setup_strategy([0])\n", + "\n", + "from 
tensorflow_asr.configs.config import Config\n", + "from tensorflow_asr.datasets import asr_dataset\n", + "from tensorflow_asr.featurizers import speech_featurizers, text_featurizers\n", + "from tensorflow_asr.models.transducer.rnn_transducer import RnnTransducer\n", + "from tensorflow_asr.optimizers.schedules import TransformerSchedule\n", + "\n", + "config = Config(config)\n", + "speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)\n", + "\n", + "text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)\n", + "\n", + "train_dataset = asr_dataset.ASRSliceDataset(\n", + " speech_featurizer=speech_featurizer,\n", + " text_featurizer=text_featurizer,\n", + " **vars(config.learning_config.train_dataset_config),\n", + " indefinite=True\n", + ")\n", + "eval_dataset = asr_dataset.ASRSliceDataset(\n", + " speech_featurizer=speech_featurizer,\n", + " text_featurizer=text_featurizer,\n", + " **vars(config.learning_config.eval_dataset_config),\n", + " indefinite=True\n", + ")\n", + "\n", + "train_dataset.load_metadata(metadata)\n", + "eval_dataset.load_metadata(metadata)\n", + "speech_featurizer.reset_length()\n", + "text_featurizer.reset_length()\n", + "\n", + "global_batch_size = config.learning_config.running_config.batch_size\n", + "global_batch_size *= strategy.num_replicas_in_sync\n", + "\n", + "train_data_loader = train_dataset.create(global_batch_size)\n", + "eval_data_loader = eval_dataset.create(global_batch_size)\n", + "\n", + "with strategy.scope():\n", + " # build model\n", + " rnnt = RnnTransducer(**config.model_config, vocabulary_size=text_featurizer.num_classes)\n", + " rnnt._build(speech_featurizer.shape)\n", + " rnnt.summary(line_length=100)\n", + "\n", + " rnnt.compile(\n", + " optimizer=config.learning_config.optimizer_config,\n", + " experimental_steps_per_execution=10,\n", + " global_batch_size=global_batch_size,\n", + " blank=text_featurizer.blank\n", + " )\n", + "\n", + "callbacks = [\n", + " 
tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint),\n", + " tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir),\n", + " tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard)\n", + "]\n", + "\n", + "rnnt.fit(\n", + " train_data_loader,\n", + " epochs=config.learning_config.running_config.num_epochs,\n", + " validation_data=eval_data_loader,\n", + " callbacks=callbacks,\n", + " steps_per_epoch=train_dataset.total_steps,\n", + " validation_steps=eval_dataset.total_steps\n", + ")" + ] + } + ] +} \ No newline at end of file diff --git a/scripts/create_vocab_from_trans.py b/scripts/create_vocab_from_trans.py index a4a2f20c61..a42148a98b 100644 --- a/scripts/create_vocab_from_trans.py +++ b/scripts/create_vocab_from_trans.py @@ -17,11 +17,9 @@ parser = argparse.ArgumentParser(prog="Create vocabulary file from transcripts") -parser.add_argument("--output", type=str, - default=None, help="The output .txt vocabulary file path") +parser.add_argument("--output", type=str, default=None, help="The output .txt vocabulary file path") -parser.add_argument("transcripts", nargs="+", type=str, - default=None, help="Transcript .tsv files") +parser.add_argument("transcripts", nargs="+", type=str, default=None, help="Transcript .tsv files") args = parser.parse_args() diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index 48b0315943..2d6883d204 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -28,7 +28,7 @@ parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model") -parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata") +parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata") parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores 
generated subwords") @@ -57,4 +57,4 @@ stage=args.stage, shuffle=False, ) -dataset.update_metadata(args.metadata_prefix) +dataset.update_metadata(args.metadata) diff --git a/tensorflow_asr/configs/config.py b/tensorflow_asr/configs/config.py index 028016e853..12fb73a959 100644 --- a/tensorflow_asr/configs/config.py +++ b/tensorflow_asr/configs/config.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Union from ..augmentations.augmentation import Augmentation from ..utils import file_util @@ -75,8 +76,8 @@ def __init__(self, config: dict = None): class Config: """ User config class for training, testing or infering """ - def __init__(self, path: str): - config = file_util.load_yaml(file_util.preprocess_paths(path)) + def __init__(self, data: Union[str, dict]): + config = data if isinstance(data, dict) else file_util.load_yaml(file_util.preprocess_paths(data)) self.speech_config = config.pop("speech_config", {}) self.decoder_config = config.pop("decoder_config", {}) self.model_config = config.pop("model_config", {}) diff --git a/tensorflow_asr/datasets/asr_dataset.py b/tensorflow_asr/datasets/asr_dataset.py index 1b6fdca3b6..2b2e61a7ea 100755 --- a/tensorflow_asr/datasets/asr_dataset.py +++ b/tensorflow_asr/datasets/asr_dataset.py @@ -14,6 +14,7 @@ import os import json +from typing import Union import tqdm import numpy as np import tensorflow as tf @@ -80,24 +81,27 @@ def save_metadata(self, metadata: str = None): f.write(json.dumps(content, indent=2)) print(f"Metadata written to {metadata}") - def load_metadata(self, metadata: str = None): + def load_metadata(self, metadata: Union[str, dict] = None): if metadata is None: return - metadata = file_util.preprocess_paths(metadata) - if tf.io.gfile.exists(metadata): - print(f"Loading metadata from {metadata} ...") - with tf.io.gfile.GFile(metadata, "r") as f: - try: - content = json.loads(f.read()).get(self.stage, {}) - except 
json.JSONDecodeError: - raise ValueError(f'File {metadata} must be in json format') - self.speech_featurizer.update_length(int(content.get("max_input_length", 0))) - self.text_featurizer.update_length(int(content.get("max_label_length", 0))) - self.total_steps = int(content.get("num_entries", 0)) - - def update_metadata(self, metadata_prefix: str = None): - self.load_metadata(metadata_prefix) + if isinstance(metadata, dict): + content = metadata + else: + metadata = file_util.preprocess_paths(metadata) + if tf.io.gfile.exists(metadata): + print(f"Loading metadata from {metadata} ...") + with tf.io.gfile.GFile(metadata, "r") as f: + try: + content = json.loads(f.read()).get(self.stage, {}) + except json.JSONDecodeError: + raise ValueError(f'File {metadata} must be in json format') + self.speech_featurizer.update_length(int(content.get("max_input_length", 0))) + self.text_featurizer.update_length(int(content.get("max_label_length", 0))) + self.total_steps = int(content.get("num_entries", 0)) + + def update_metadata(self, metadata: str = None): + self.load_metadata(metadata) self.compute_metadata() - self.save_metadata(metadata_prefix) + self.save_metadata(metadata) # -------------------------------- ENTRIES -------------------------------------