From 7ce069c8731e4d98b8fa903d50fb9b7ae5ba636a Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Sat, 10 Apr 2021 14:08:02 +0700
Subject: [PATCH 01/13] :rocket: init refactoring

---
 setup.py                                      |   2 +-
 tensorflow_asr/metrics/__init__.py            |   0
 tensorflow_asr/metrics/error_rates.py         |  33 ++++
 tensorflow_asr/models/__init__.py             |  75 ---------
 tensorflow_asr/models/activations/__init__.py |   0
 .../{activations.py => activations/glu.py}    |   0
 tensorflow_asr/models/base_model.py           |  75 +++++++++
 tensorflow_asr/models/conformer.py            |   6 +-
 tensorflow_asr/models/contextnet.py           |  20 ++-
 tensorflow_asr/models/ctc.py                  |  55 +++++-
 tensorflow_asr/models/deepspeech2.py          |   8 +-
 tensorflow_asr/models/jasper.py               |   4 +-
 tensorflow_asr/models/streaming_transducer.py |  10 +-
 tensorflow_asr/models/transducer.py           | 103 ++++++++----
 tensorflow_asr/utils/__init__.py              |  74 ---------
 tensorflow_asr/utils/env_util.py              |  77 +++++++++
 tensorflow_asr/utils/feature_util.py          |  27 +++
 tensorflow_asr/utils/file_util.py             |  57 +++++++
 tensorflow_asr/utils/layer_util.py            |  29 ++++
 .../utils/{utils.py => math_util.py}          | 156 ++----------------
 .../utils/{metrics.py => metric_util.py}      |  43 ++---
 tensorflow_asr/utils/shape_util.py            |  32 ++++
 22 files changed, 507 insertions(+), 379 deletions(-)
 create mode 100644 tensorflow_asr/metrics/__init__.py
 create mode 100644 tensorflow_asr/metrics/error_rates.py
 create mode 100644 tensorflow_asr/models/activations/__init__.py
 rename tensorflow_asr/models/{activations.py => activations/glu.py} (100%)
 mode change 100755 => 100644
 create mode 100644 tensorflow_asr/models/base_model.py
 create mode 100644 tensorflow_asr/utils/env_util.py
 create mode 100644 tensorflow_asr/utils/feature_util.py
 create mode 100644 tensorflow_asr/utils/file_util.py
 create mode 100644 tensorflow_asr/utils/layer_util.py
 rename tensorflow_asr/utils/{utils.py => math_util.py} (53%)
 mode change 100755 => 100644
 rename tensorflow_asr/utils/{metrics.py => metric_util.py} (67%)
 create mode 100644 tensorflow_asr/utils/shape_util.py

diff --git a/setup.py b/setup.py
index a2c415d29e..717cbcddfd 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@
 
 setuptools.setup(
     name="TensorFlowASR",
-    version="0.8.3",
+    version="1.0.0",
     author="Huy Le Nguyen",
     author_email="nlhuy.cs.16@gmail.com",
     description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2",
diff --git a/tensorflow_asr/metrics/__init__.py b/tensorflow_asr/metrics/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow_asr/metrics/error_rates.py b/tensorflow_asr/metrics/error_rates.py
new file mode 100644
index 0000000000..143e199109
--- /dev/null
+++ b/tensorflow_asr/metrics/error_rates.py
@@ -0,0 +1,33 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+
+class ErrorRate(tf.keras.metrics.Metric):
+    """Accumulated error rate in percent (e.g. WER or CER): 100 * total numerator / total denominator."""
+
+    def __init__(self, func, name="error_rate", **kwargs):
+        super().__init__(name=name, **kwargs)
+        self.func = func  # callable(decode, target) -> (numerator, denominator) for one batch
+        self.numerator = self.add_weight(name=f"{name}_numerator", initializer="zeros")
+        self.denominator = self.add_weight(name=f"{name}_denominator", initializer="zeros")
+
+    def update_state(self, decode: tf.Tensor, target: tf.Tensor):
+        batch_numerator, batch_denominator = self.func(decode, target)
+        self.numerator.assign_add(batch_numerator)
+        self.denominator.assign_add(batch_denominator)
+
+    def result(self):  # divide_no_nan returns 0 while the denominator is still 0
+        return tf.math.divide_no_nan(self.numerator, self.denominator) * 100
diff --git a/tensorflow_asr/models/__init__.py b/tensorflow_asr/models/__init__.py
index 7f37b4ffb1..e69de29bb2 100644
--- a/tensorflow_asr/models/__init__.py
+++ b/tensorflow_asr/models/__init__.py
@@ -1,75 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import abc
-import tempfile
-import tensorflow as tf
-
-from ..utils.utils import is_cloud_path, is_hdf5_filepath
-
-
-class Model(tf.keras.Model):
-    def __init__(self, name, **kwargs):
-        super(Model, self).__init__(name=name, **kwargs)
-
-    def save(self, filepath, overwrite=True, include_optimizer=True, save_format=None,
-             signatures=None, options=None, save_traces=True):
-        if is_cloud_path(filepath) and is_hdf5_filepath(filepath):
-            _, ext = os.path.splitext(filepath)
-            with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
-                super(Model, self).save(
-                    tmp.name, overwrite=overwrite, include_optimizer=include_optimizer,
-                    save_format=save_format, signatures=signatures, options=options, save_traces=save_traces
-                )
-                tf.io.gfile.copy(tmp.name, filepath, overwrite=True)
-        else:
-            super(Model, self).save(
-                filepath, overwrite=overwrite, include_optimizer=include_optimizer,
-                save_format=save_format, signatures=signatures, options=options, save_traces=save_traces
-            )
-
-    def save_weights(self, filepath, overwrite=True, save_format=None, options=None):
-        if is_cloud_path(filepath) and is_hdf5_filepath(filepath):
-            _, ext = os.path.splitext(filepath)
-            with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
-                super(Model, self).save_weights(tmp.name, overwrite=overwrite, save_format=save_format, options=options)
-                tf.io.gfile.copy(tmp.name, filepath, overwrite=True)
-        else:
-            super(Model, self).save_weights(filepath, overwrite=overwrite, save_format=save_format, options=options)
-
-    def load_weights(self, filepath, by_name=False, skip_mismatch=False, options=None):
-        if is_cloud_path(filepath) and is_hdf5_filepath(filepath):
-            _, ext = os.path.splitext(filepath)
-            with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
-                tf.io.gfile.copy(filepath, tmp.name, overwrite=True)
-                super(Model, self).load_weights(tmp.name, by_name=by_name, skip_mismatch=skip_mismatch, options=options)
-        else:
-            super(Model, self).load_weights(filepath, by_name=by_name, skip_mismatch=skip_mismatch, options=options)
-
-    @abc.abstractmethod
-    def _build(self, *args, **kwargs):
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def call(self, inputs, training=False, **kwargs):
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def recognize(self, features, input_lengths, **kwargs):
-        pass
-
-    @abc.abstractmethod
-    def recognize_beam(self, features, input_lengths, **kwargs):
-        pass
diff --git a/tensorflow_asr/models/activations/__init__.py b/tensorflow_asr/models/activations/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow_asr/models/activations.py b/tensorflow_asr/models/activations/glu.py
old mode 100755
new mode 100644
similarity index 100%
rename from tensorflow_asr/models/activations.py
rename to tensorflow_asr/models/activations/glu.py
diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py
new file mode 100644
index 0000000000..c545577abc
--- /dev/null
+++ b/tensorflow_asr/models/base_model.py
@@ -0,0 +1,75 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import abc
+import tempfile
+import tensorflow as tf
+
+from ..utils import file_util
+
+
+class Model(tf.keras.Model):
+    """Keras model base class that stages HDF5 saves/loads through a local temp file for cloud paths."""
+
+    def __init__(self, name, **kwargs):
+        super().__init__(name=name, **kwargs)
+
+    def save(self, filepath, overwrite=True, include_optimizer=True, save_format=None,
+             signatures=None, options=None, save_traces=True):
+        keras_args = dict(overwrite=overwrite, include_optimizer=include_optimizer, save_format=save_format,
+                          signatures=signatures, options=options, save_traces=save_traces)
+        if not (file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath)):
+            super().save(filepath, **keras_args)  # anything else goes straight to Keras
+            return
+        suffix = os.path.splitext(filepath)[1]
+        with tempfile.NamedTemporaryFile(suffix=suffix) as staged:
+            super().save(staged.name, **keras_args)  # write locally, then copy to the real destination
+            tf.io.gfile.copy(staged.name, filepath, overwrite=True)
+
+    def save_weights(self, filepath, overwrite=True, save_format=None, options=None):
+        keras_args = dict(overwrite=overwrite, save_format=save_format, options=options)
+        if not (file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath)):
+            super().save_weights(filepath, **keras_args)
+            return
+        suffix = os.path.splitext(filepath)[1]
+        with tempfile.NamedTemporaryFile(suffix=suffix) as staged:
+            super().save_weights(staged.name, **keras_args)
+            tf.io.gfile.copy(staged.name, filepath, overwrite=True)
+
+    def load_weights(self, filepath, by_name=False, skip_mismatch=False, options=None):
+        keras_args = dict(by_name=by_name, skip_mismatch=skip_mismatch, options=options)
+        if not (file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath)):
+            super().load_weights(filepath, **keras_args)
+            return
+        suffix = os.path.splitext(filepath)[1]
+        with tempfile.NamedTemporaryFile(suffix=suffix) as staged:
+            tf.io.gfile.copy(filepath, staged.name, overwrite=True)  # fetch first, then load the local copy
+            super().load_weights(staged.name, **keras_args)
+
+    @abc.abstractmethod
+    def _build(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def call(self, inputs, training=False, **kwargs):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def recognize(self, features, input_lengths, **kwargs):
+        pass
+
+    @abc.abstractmethod
+    def recognize_beam(self, features, input_lengths, **kwargs):
+        pass
diff --git a/tensorflow_asr/models/conformer.py b/tensorflow_asr/models/conformer.py
index 0fa3585ce4..a13dfa1d19 100755
--- a/tensorflow_asr/models/conformer.py
+++ b/tensorflow_asr/models/conformer.py
@@ -14,12 +14,12 @@
 
 import tensorflow as tf
 
-from .activations import GLU
+from .activations.glu import GLU
 from .transducer import Transducer
 from .layers.subsampling import VggSubsampling, Conv2dSubsampling
 from .layers.positional_encoding import PositionalEncoding, PositionalEncodingConcat
 from .layers.multihead_attention import MultiHeadAttention, RelPositionMultiHeadAttention
-from ..utils.utils import shape_list
+from ..utils import shape_util
 
 L2 = tf.keras.regularizers.l2(1e-6)
 
@@ -179,7 +179,7 @@ def __init__(self,
 
     def call(self, inputs, training=False, **kwargs):
         outputs = self.ln(inputs, training=training)
-        B, T, E = shape_list(outputs)
+        B, T, E = shape_util.shape_list(outputs)
         outputs = tf.reshape(outputs, [B, T, 1, E])
         outputs = self.pw_conv_1(outputs, training=training)
         outputs = self.glu(outputs)
diff --git a/tensorflow_asr/models/contextnet.py b/tensorflow_asr/models/contextnet.py
index 8bc4e12857..636560101d 100644
--- a/tensorflow_asr/models/contextnet.py
+++ b/tensorflow_asr/models/contextnet.py
@@ -16,7 +16,7 @@
 from typing import List
 import tensorflow as tf
 from .transducer import Transducer
-from ..utils.utils import merge_two_last_dims, get_reduced_length
+from ..utils import math_util
 
 L2 = tf.keras.regularizers.l2(1e-6)
 
@@ -30,7 +30,7 @@ def get_activation(activation: str = "silu"):
 
 
 class Reshape(tf.keras.layers.Layer):
-    def call(self, inputs): return merge_two_last_dims(inputs)
+    def call(self, inputs): return math_util.merge_two_last_dims(inputs)  # layer wrapper around the math_util helper
 
 
 class ConvModule(tf.keras.layers.Layer):
@@ -154,7 +154,7 @@ def call(self, inputs, training=False, **kwargs):
         for conv in self.convs:
             outputs = conv(outputs, training=training)
         outputs = self.last_conv(outputs, training=training)
-        input_length = get_reduced_length(input_length, self.last_conv.strides)
+        input_length = math_util.get_reduced_length(input_length, self.last_conv.strides)
         outputs = self.se([outputs, input_length], training=training)
         if self.residual is not None:
             res = self.residual(features, training=training)
@@ -282,8 +282,11 @@ def recognize(self,
             tf.Tensor: a batch of decoded transcripts
         """
         encoded = self.encoder([features, input_length], training=False)
-        return self._perform_greedy_batch(encoded, input_length,
-                                          parallel_iterations=parallel_iterations, swap_memory=swap_memory)
+        return self._perform_greedy_batch(
+            encoded, input_length,
+            parallel_iterations=parallel_iterations,
+            swap_memory=swap_memory
+        )
 
     def recognize_tflite(self, signal, predicted, prediction_states):
         """
@@ -347,5 +350,8 @@ def recognize_beam(self,
             tf.Tensor: a batch of decoded transcripts
         """
         encoded = self.encoder([features, input_length], training=False)
-        return self._perform_beam_search_batch(encoded, input_length, lm,
-                                               parallel_iterations=parallel_iterations, swap_memory=swap_memory)
+        return self._perform_beam_search_batch(
+            encoded, input_length, lm,
+            parallel_iterations=parallel_iterations,
+            swap_memory=swap_memory
+        )
diff --git a/tensorflow_asr/models/ctc.py b/tensorflow_asr/models/ctc.py
index 0e12c52c79..a95949544b 100644
--- a/tensorflow_asr/models/ctc.py
+++ b/tensorflow_asr/models/ctc.py
@@ -15,11 +15,13 @@
 from typing import Optional
 import numpy as np
 import tensorflow as tf
+from tensorflow.keras import mixed_precision as mxp
 
 from . import Model
 from ..featurizers.speech_featurizers import TFSpeechFeaturizer
 from ..featurizers.text_featurizers import TextFeaturizer
-from ..utils.utils import shape_list, get_reduced_length
+from ..utils import math_util, shape_util
+from ..losses.keras.ctc_losses import CtcLoss
 
 
 class CtcModel(Model):
@@ -31,6 +33,49 @@ def _build(self, input_shape, batch_size=None):
         features = tf.keras.Input(input_shape, batch_size=batch_size, dtype=tf.float32)
         self(features, training=False)
 
+    @property
+    def metrics(self):  # reports only the running loss; loss_metric is assigned in compile()
+        return [self.loss_metric]
+
+    def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, **kwargs):
+        loss = CtcLoss(blank=blank, global_batch_size=global_batch_size)
+        self.use_loss_scale = use_loss_scale  # read again by train_step to choose the gradient path
+        if self.use_loss_scale:
+            optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic")
+        self.loss_metric = tf.keras.metrics.Mean(name="ctc_loss", dtype=tf.float32)  # exposed via `metrics`
+        super(CtcModel, self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs)
+
+    def train_step(self, batch):
+        """Run one optimisation step on ``batch`` and return the current metric values keyed by name."""
+        inputs, labels = batch
+        with tf.GradientTape() as tape:
+            logit = self(inputs["input"], training=True)
+            logit_length = math_util.get_reduced_length(inputs["input_length"], self.time_reduction_factor)
+            loss = self.loss(labels, {"logit": logit, "logit_length": logit_length})
+            # With loss scaling on, the scaled loss is what gets differentiated; compute it under the tape.
+            target = self.optimizer.get_scaled_loss(loss) if self.use_loss_scale else loss
+        gradients = tape.gradient(target, self.trainable_weights)
+        # Undo the scaling before the update so the optimizer receives unscaled gradients.
+        if self.use_loss_scale:
+            gradients = self.optimizer.get_unscaled_gradients(gradients)
+        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+        self.loss_metric.update_state(loss)
+        results = {}
+        for metric in self.metrics:
+            results[metric.name] = metric.result()
+        return results
+
+    def test_step(self, batch):
+        """Evaluate one batch (forward pass, loss, running-loss update) and return metric values by name."""
+        inputs, labels = batch
+        logit = self(inputs["input"], training=False)
+        logit_length = math_util.get_reduced_length(inputs["input_length"], self.time_reduction_factor)
+        self.loss_metric.update_state(self.loss(labels, {"logit": logit, "logit_length": logit_length}))
+        results = {}
+        for metric in self.metrics:
+            results[metric.name] = metric.result()
+        return results
+
     def add_featurizers(self,
                         speech_featurizer: TFSpeechFeaturizer,
                         text_featurizer: TextFeaturizer):
@@ -67,8 +112,8 @@ def recognize_tflite(self, signal):
         """
         features = self.speech_featurizer.tf_extract(signal)
         features = tf.expand_dims(features, axis=0)
-        input_length = shape_list(features)[1]
-        input_length = get_reduced_length(input_length, self.time_reduction_factor)
+        input_length = shape_util.shape_list(features)[1]
+        input_length = math_util.get_reduced_length(input_length, self.time_reduction_factor)
         input_length = tf.expand_dims(input_length, axis=0)
         logits = self(features, training=False)
         probs = tf.nn.softmax(logits)
@@ -113,8 +158,8 @@ def recognize_beam_tflite(self, signal):
         """
         features = self.speech_featurizer.tf_extract(signal)
         features = tf.expand_dims(features, axis=0)
-        input_length = shape_list(features)[1]
-        input_length = get_reduced_length(input_length, self.time_reduction_factor)
+        input_length = shape_util.shape_list(features)[1]
+        input_length = math_util.get_reduced_length(input_length, self.time_reduction_factor)
         input_length = tf.expand_dims(input_length, axis=0)
         logits = self(features, training=False)
         probs = tf.nn.softmax(logits)
diff --git a/tensorflow_asr/models/deepspeech2.py b/tensorflow_asr/models/deepspeech2.py
index 6bc99fe1f9..1e855c5ef3 100644
--- a/tensorflow_asr/models/deepspeech2.py
+++ b/tensorflow_asr/models/deepspeech2.py
@@ -14,14 +14,14 @@
 
 import tensorflow as tf
 
-from ..utils.utils import get_rnn, get_conv, merge_two_last_dims
+from ..utils import layer_util, math_util
 from .layers.row_conv_1d import RowConv1D
 from .layers.sequence_wise_bn import SequenceBatchNorm
 from .ctc import CtcModel
 
 
 class Reshape(tf.keras.layers.Layer):
-    def call(self, inputs): return merge_two_last_dims(inputs)
+    def call(self, inputs): return math_util.merge_two_last_dims(inputs)  # layer wrapper around the math_util helper
 
 
 class ConvBlock(tf.keras.layers.Layer):
@@ -34,7 +34,7 @@ def __init__(self,
                  **kwargs):
         super(ConvBlock, self).__init__(**kwargs)
 
-        CNN = get_conv(conv_type)
+        CNN = layer_util.get_conv(conv_type)
         self.conv = CNN(filters=filters, kernel_size=kernels,
                         strides=strides, padding="same",
                         dtype=tf.float32, name=f"{self.name}_{conv_type}")
@@ -118,7 +118,7 @@ def __init__(self,
                  **kwargs):
         super(RnnBlock, self).__init__(**kwargs)
 
-        RNN = get_rnn(rnn_type)
+        RNN = layer_util.get_rnn(rnn_type)
         self.rnn = RNN(units, dropout=dropout, return_sequences=True,
                        use_bias=True, name=f"{self.name}_{rnn_type}")
         if bidirectional:
diff --git a/tensorflow_asr/models/jasper.py b/tensorflow_asr/models/jasper.py
index 70709da644..a8b0780403 100644
--- a/tensorflow_asr/models/jasper.py
+++ b/tensorflow_asr/models/jasper.py
@@ -14,12 +14,12 @@
 
 import tensorflow as tf
 
-from ..utils.utils import merge_two_last_dims
+from ..utils import math_util
 from .ctc import CtcModel
 
 
 class Reshape(tf.keras.layers.Layer):
-    def call(self, inputs): return merge_two_last_dims(inputs)
+    def call(self, inputs): return math_util.merge_two_last_dims(inputs)  # layer wrapper around the math_util helper
 
 
 class JasperSubBlock(tf.keras.layers.Layer):
diff --git a/tensorflow_asr/models/streaming_transducer.py b/tensorflow_asr/models/streaming_transducer.py
index 266db0e13e..ba793126e2 100644
--- a/tensorflow_asr/models/streaming_transducer.py
+++ b/tensorflow_asr/models/streaming_transducer.py
@@ -17,11 +17,11 @@
 
 from .layers.subsampling import TimeReduction
 from .transducer import Transducer
-from ..utils.utils import get_rnn, merge_two_last_dims, shape_list
+from ..utils import layer_util, math_util, shape_util
 
 
 class Reshape(tf.keras.layers.Layer):
-    def call(self, inputs): return merge_two_last_dims(inputs)
+    def call(self, inputs): return math_util.merge_two_last_dims(inputs)  # layer wrapper around the math_util helper
 
 
 class StreamingTransducerBlock(tf.keras.Model):
@@ -41,7 +41,7 @@ def __init__(self,
         else:
             self.reduction = None
 
-        RNN = get_rnn(rnn_type)
+        RNN = layer_util.get_rnn(rnn_type)
         self.rnn = RNN(
             units=rnn_units, return_sequences=True,
             name=f"{self.name}_{rnn_type}", return_state=True,
@@ -269,7 +269,7 @@ def recognize(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        batch_size, _, _, _ = shape_list(features)
+        batch_size, _, _, _ = shape_util.shape_list(features)
         encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size))
         return self._perform_greedy_batch(encoded, input_length,
                                           parallel_iterations=parallel_iterations, swap_memory=swap_memory)
@@ -336,7 +336,7 @@ def recognize_beam(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        batch_size, _, _, _ = shape_list(features)
+        batch_size, _, _, _ = shape_util.shape_list(features)
         encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size))
         return self._perform_beam_search_batch(encoded, input_length, lm,
                                                parallel_iterations=parallel_iterations, swap_memory=swap_memory)
diff --git a/tensorflow_asr/models/transducer.py b/tensorflow_asr/models/transducer.py
index 6195e2ee7d..efd3c4d55e 100755
--- a/tensorflow_asr/models/transducer.py
+++ b/tensorflow_asr/models/transducer.py
@@ -15,12 +15,14 @@
 
 import collections
 import tensorflow as tf
+from tensorflow.keras import mixed_precision as mxp
 
 from . import Model
-from ..utils.utils import get_rnn, shape_list, count_non_blank, pad_prediction_tfarray
+from ..utils import math_util, layer_util, shape_util
 from ..featurizers.speech_featurizers import SpeechFeaturizer
 from ..featurizers.text_featurizers import TextFeaturizer
 from .layers.embedding import Embedding
+from ..losses.keras.rnnt_losses import RnntLoss
 
 Hypothesis = collections.namedtuple("Hypothesis", ("index", "prediction", "states"))
 
@@ -47,7 +49,7 @@ def __init__(self,
                                regularizer=kernel_regularizer, name=f"{name}_embedding")
         self.do = tf.keras.layers.Dropout(embed_dropout, name=f"{name}_dropout")
         # Initialize rnn layers
-        RNN = get_rnn(rnn_type)
+        RNN = layer_util.get_rnn(rnn_type)
         self.rnns = []
         for i in range(num_rnns):
             rnn = RNN(
@@ -302,12 +304,21 @@ def __init__(self,
         )
         self.time_reduction_factor = 1
 
+    @property
+    def metrics(self):  # reports only the running loss; loss_metric is assigned in compile()
+        return [self.loss_metric]
+
     def _build(self, input_shape, prediction_shape=[None], batch_size=None):
         inputs = tf.keras.Input(shape=input_shape, batch_size=batch_size, dtype=tf.float32)
         input_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
         pred = tf.keras.Input(shape=prediction_shape, batch_size=batch_size, dtype=tf.int32)
         pred_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
-        self([inputs, input_length, pred, pred_length], training=False)
+        self({  # call() looks its inputs up by these keys
+            "input": inputs,
+            "input_length": input_length,
+            "prediction": pred,
+            "prediction_length": pred_length
+        }, training=False)  # one symbolic forward pass with training disabled
 
     def summary(self, line_length=None, **kwargs):
         if self.encoder is not None: self.encoder.summary(line_length=line_length, **kwargs)
@@ -328,25 +339,25 @@ def add_featurizers(self,
         self.speech_featurizer = speech_featurizer
         self.text_featurizer = text_featurizer
 
-    def call(self, inputs, training=False, **kwargs):
-        """
-        Transducer Model call function
-        Args:
-            features: audio features in shape [B, T, F, C]
-            input_length: features time length in shape [B]
-            prediction: predicted sequence of ids, in shape [B, U]
-            prediction_length: predicted sequence of ids length in shape [B]
-            training: python boolean
-            **kwargs: sth else
+    def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, **kwargs):
+        loss = RnntLoss(blank=blank, global_batch_size=global_batch_size)
+        self.use_loss_scale = use_loss_scale  # kept on the instance for later use
+        if self.use_loss_scale:  # wrap the resolved optimizer with "dynamic" loss scaling
+            optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic")
+        self.loss_metric = tf.keras.metrics.Mean(name="rnnt_loss", dtype=tf.float32)  # exposed via `metrics`
+        super(Transducer, self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs)
 
-        Returns:
-            `logits` with shape [B, T, U, vocab]
-        """
-        features, _, prediction, prediction_length = inputs
+    def call(self, inputs, training=False, **kwargs):  # inputs: dict with the four keys read below
+        features = inputs["input"]
+        prediction = inputs["prediction"]
+        prediction_length = inputs["prediction_length"]
         enc = self.encoder(features, training=training, **kwargs)
         pred = self.predict_net([prediction, prediction_length], training=training, **kwargs)
         outputs = self.joint_net([enc, pred], training=training, **kwargs)
-        return outputs
+        return {  # a dict now replaces the bare joint-network output that was returned before
+            "logit": outputs,
+            "logit_length": math_util.get_reduced_length(inputs["input_length"], self.time_reduction_factor)
+        }
 
     # -------------------------------- INFERENCES-------------------------------------
 
@@ -485,7 +496,7 @@ def body(batch, decoded):
                 parallel_iterations=parallel_iterations, swap_memory=True,
             )
 
-            decoded = pad_prediction_tfarray(decoded, blank=self.text_featurizer.blank)
+            decoded = math_util.pad_prediction_tfarray(decoded, blank=self.text_featurizer.blank)
             return self.text_featurizer.iextract(decoded.stack())
 
     def _perform_greedy(self,
@@ -641,7 +652,7 @@ def body(batch, decoded):
                 parallel_iterations=parallel_iterations, swap_memory=True,
             )
 
-            decoded = pad_prediction_tfarray(decoded, blank=self.text_featurizer.blank)
+            decoded = math_util.pad_prediction_tfarray(decoded, blank=self.text_featurizer.blank)
             return self.text_featurizer.iextract(decoded.stack())
 
     def _perform_beam_search(self,
@@ -661,20 +672,32 @@ def _perform_beam_search(self,
             def initialize_beam(dynamic=False):
                 return BeamHypothesis(
                     score=tf.TensorArray(
-                        dtype=tf.float32, size=beam_width if not dynamic else 0, dynamic_size=dynamic,
-                        element_shape=tf.TensorShape([]), clear_after_read=False
+                        dtype=tf.float32,
+                        size=beam_width if not dynamic else 0,
+                        dynamic_size=dynamic,
+                        element_shape=tf.TensorShape([]),
+                        clear_after_read=False
                     ),
                     indices=tf.TensorArray(
-                        dtype=tf.int32, size=beam_width if not dynamic else 0, dynamic_size=dynamic,
-                        element_shape=tf.TensorShape([]), clear_after_read=False
+                        dtype=tf.int32,
+                        size=beam_width if not dynamic else 0,
+                        dynamic_size=dynamic,
+                        element_shape=tf.TensorShape([]),
+                        clear_after_read=False
                     ),
                     prediction=tf.TensorArray(
-                        dtype=tf.int32, size=beam_width if not dynamic else 0, dynamic_size=dynamic,
-                        element_shape=None, clear_after_read=False
+                        dtype=tf.int32,
+                        size=beam_width if not dynamic else 0,
+                        dynamic_size=dynamic,
+                        element_shape=None,
+                        clear_after_read=False
                     ),
                     states=tf.TensorArray(
-                        dtype=tf.float32, size=beam_width if not dynamic else 0, dynamic_size=dynamic,
-                        element_shape=tf.TensorShape(shape_list(self.predict_net.get_initial_state())), clear_after_read=False
+                        dtype=tf.float32,
+                        size=beam_width if not dynamic else 0,
+                        dynamic_size=dynamic,
+                        element_shape=tf.TensorShape(shape_util.shape_list(self.predict_net.get_initial_state())),
+                        clear_after_read=False
                     ),
                 )
 
@@ -694,7 +717,11 @@ def body(time, total, B):
                     score=A.score.unstack(B.score.stack()),
                     indices=A.indices.unstack(B.indices.stack()),
                     prediction=A.prediction.unstack(
-                        pad_prediction_tfarray(B.prediction, blank=self.text_featurizer.blank).stack()),
+                        math_util.pad_prediction_tfarray(
+                            B.prediction,
+                            blank=self.text_featurizer.blank
+                        ).stack()
+                    ),
                     states=A.states.unstack(B.states.stack()),
                 )
                 A_i = tf.constant(0, tf.int32)
@@ -710,7 +737,9 @@ def beam_body(beam, beam_width, A, A_i, B):
                     y_hat_score = y_hat_score[0]
                     y_hat_index = tf.gather_nd(A.indices.stack(), y_hat_score_index)
                     y_hat_prediction = tf.gather_nd(
-                        pad_prediction_tfarray(A.prediction, blank=self.text_featurizer.blank).stack(), y_hat_score_index)
+                        math_util.pad_prediction_tfarray(A.prediction, blank=self.text_featurizer.blank).stack(),
+                        y_hat_score_index
+                    )
                     y_hat_states = tf.gather_nd(A.states.stack(), y_hat_score_index)
 
                     # remove y_hat from A
@@ -720,8 +749,12 @@ def beam_body(beam, beam_width, A, A_i, B):
                     A = BeamHypothesis(
                         score=A.score.unstack(tf.gather_nd(A.score.stack(), remain_indices)),
                         indices=A.indices.unstack(tf.gather_nd(A.indices.stack(), remain_indices)),
-                        prediction=A.prediction.unstack(tf.gather_nd(
-                            pad_prediction_tfarray(A.prediction, blank=self.text_featurizer.blank).stack(), remain_indices)),
+                        prediction=A.prediction.unstack(
+                            tf.gather_nd(
+                                math_util.pad_prediction_tfarray(A.prediction, blank=self.text_featurizer.blank).stack(),
+                                remain_indices
+                            )
+                        ),
                         states=A.states.unstack(tf.gather_nd(A.states.stack(), remain_indices)),
                     )
                     A_i = tf.cond(tf.equal(A_i, 0), true_fn=lambda: A_i, false_fn=lambda: A_i - 1)
@@ -747,7 +780,7 @@ def true_fn():
                             )
 
                         def false_fn():
-                            scatter_index = count_non_blank(y_hat_prediction, blank=self.text_featurizer.blank)
+                            scatter_index = math_util.count_non_blank(y_hat_prediction, blank=self.text_featurizer.blank)
                             updated_prediction = tf.tensor_scatter_nd_update(
                                 y_hat_prediction,
                                 indices=tf.reshape(scatter_index, [1, 1]),
@@ -797,9 +830,9 @@ def false_fn():
             )
 
             scores = B.score.stack()
-            prediction = pad_prediction_tfarray(B.prediction, blank=self.text_featurizer.blank).stack()
+            prediction = math_util.pad_prediction_tfarray(B.prediction, blank=self.text_featurizer.blank).stack()
             if self.text_featurizer.decoder_config.norm_score:
-                prediction_lengths = count_non_blank(prediction, blank=self.text_featurizer.blank, axis=1)
+                prediction_lengths = math_util.count_non_blank(prediction, blank=self.text_featurizer.blank, axis=1)
                 scores /= tf.cast(prediction_lengths, dtype=scores.dtype)
 
             y_hat_score, y_hat_score_index = tf.math.top_k(scores, k=1)
diff --git a/tensorflow_asr/utils/__init__.py b/tensorflow_asr/utils/__init__.py
index e7becd8f27..e69de29bb2 100644
--- a/tensorflow_asr/utils/__init__.py
+++ b/tensorflow_asr/utils/__init__.py
@@ -1,74 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-def setup_environment():  # Set memory growth and only log ERRORs
-    """ Setting tensorflow running environment """
-    import warnings
-
-    warnings.simplefilter("ignore")
-
-    import tensorflow as tf
-
-    tf.get_logger().setLevel("ERROR")
-
-    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
-
-
-def setup_devices(devices, cpu=False):
-    """Setting visible devices
-
-    Args:
-        devices (list): list of visible devices' indices
-    """
-    import tensorflow as tf
-
-    if cpu:
-        cpus = tf.config.list_physical_devices("CPU")
-        tf.config.set_visible_devices(cpus, "CPU")
-    else:
-        gpus = tf.config.list_physical_devices("GPU")
-        if gpus:
-            visible_gpus = [gpus[i] for i in devices]
-            tf.config.set_visible_devices(visible_gpus, "GPU")
-            print("Run on", len(visible_gpus), "Physical GPUs")
-
-
-def setup_strategy(devices):
-    """Setting mirrored strategy for training
-
-    Args:
-        devices (list): list of visible devices' indices
-
-    Returns:
-        tf.distribute.Strategy: MirroredStrategy for training one or multiple gpus
-    """
-    import tensorflow as tf
-
-    setup_devices(devices)
-
-    return tf.distribute.MirroredStrategy()
-
-
-def setup_tpu(tpu_address=None):
-    import tensorflow as tf
-
-    if tpu_address is None:
-        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
-    else:
-        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + tpu_address)
-    tf.config.experimental_connect_to_cluster(resolver)
-    tf.tpu.experimental.initialize_tpu_system(resolver)
-    print("All TPUs: ", tf.config.list_logical_devices('TPU'))
-    return tf.distribute.experimental.TPUStrategy(resolver)
diff --git a/tensorflow_asr/utils/env_util.py b/tensorflow_asr/utils/env_util.py
new file mode 100644
index 0000000000..2bf4970415
--- /dev/null
+++ b/tensorflow_asr/utils/env_util.py
@@ -0,0 +1,77 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+
+def setup_environment():  # Set memory growth and only log ERRORs
+    """ Setting tensorflow running environment """
+    import warnings
+    warnings.simplefilter("ignore")
+    tf.get_logger().setLevel("ERROR")
+    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
+
+
+def setup_devices(devices, cpu=False):
+    """Setting visible devices
+
+    Args:
+        devices (list): list of visible devices' indices
+    """
+    if cpu:
+        cpus = tf.config.list_physical_devices("CPU")
+        tf.config.set_visible_devices(cpus, "CPU")
+    else:
+        gpus = tf.config.list_physical_devices("GPU")
+        if gpus:
+            visible_gpus = [gpus[i] for i in devices]
+            tf.config.set_visible_devices(visible_gpus, "GPU")
+            print("Run on", len(visible_gpus), "Physical GPUs")
+
+
+def setup_strategy(devices):
+    """Setting mirrored strategy for training
+
+    Args:
+        devices (list): list of visible devices' indices
+
+    Returns:
+        tf.distribute.Strategy: MirroredStrategy for training one or multiple gpus
+    """
+    setup_devices(devices)
+    return tf.distribute.MirroredStrategy()
+
+
+def setup_tpu(tpu_address=None):
+    if tpu_address is None:
+        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
+    else:
+        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + tpu_address)
+    tf.config.experimental_connect_to_cluster(resolver)
+    tf.tpu.experimental.initialize_tpu_system(resolver)
+    print("All TPUs: ", tf.config.list_logical_devices('TPU'))
+    return tf.distribute.experimental.TPUStrategy(resolver)
+
+
+def has_gpu_or_tpu():
+    gpus = tf.config.list_logical_devices("GPU")
+    tpus = tf.config.list_logical_devices("TPU")
+    if len(gpus) == 0 and len(tpus) == 0: return False
+    return True
+
+
+def has_tpu():
+    tpus = tf.config.list_logical_devices("TPU")
+    if len(tpus) == 0: return False
+    return True
diff --git a/tensorflow_asr/utils/feature_util.py b/tensorflow_asr/utils/feature_util.py
new file mode 100644
index 0000000000..0d8a294ce1
--- /dev/null
+++ b/tensorflow_asr/utils/feature_util.py
@@ -0,0 +1,27 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+
+def float_feature(list_of_floats):
+    return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_floats))
+
+
+def int64_feature(list_of_ints):
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=list_of_ints))
+
+
+def bytestring_feature(list_of_bytestrings):
+    return tf.train.Feature(bytes_list=tf.train.BytesList(value=list_of_bytestrings))
diff --git a/tensorflow_asr/utils/file_util.py b/tensorflow_asr/utils/file_util.py
new file mode 100644
index 0000000000..c9d1c867d0
--- /dev/null
+++ b/tensorflow_asr/utils/file_util.py
@@ -0,0 +1,57 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+from typing import Union, List
+import tensorflow as tf
+
+
+def is_hdf5_filepath(filepath):
+    return (filepath.endswith('.h5') or filepath.endswith('.keras') or filepath.endswith('.hdf5'))
+
+
+def is_cloud_path(path):
+    """ Check if the path is on cloud (which requires tf.io.gfile)
+
+    Args:
+        path (str): Path to directory or file
+
+    Returns:
+        bool: True if path is on cloud, False otherwise
+    """
+    return bool(re.match(r"^[a-z]+://", path))
+
+
+def preprocess_paths(paths: Union[List, str]):
+    """Expand the path to the root "/"
+
+    Args:
+        paths (Union[List, str]): A path or list of paths
+
+    Returns:
+        Union[List, str]: A processed path or list of paths, return None if it's not path
+    """
+    if isinstance(paths, list):
+        return [path if is_cloud_path(path) else os.path.abspath(os.path.expanduser(path)) for path in paths]
+    elif isinstance(paths, str):
+        return paths if is_cloud_path(paths) else os.path.abspath(os.path.expanduser(paths))
+    else:
+        return None
+
+
+def read_bytes(path: str) -> tf.Tensor:
+    with tf.io.gfile.GFile(path, "rb") as f:
+        content = f.read()
+    return tf.convert_to_tensor(content, dtype=tf.string)
diff --git a/tensorflow_asr/utils/layer_util.py b/tensorflow_asr/utils/layer_util.py
new file mode 100644
index 0000000000..6e2647f581
--- /dev/null
+++ b/tensorflow_asr/utils/layer_util.py
@@ -0,0 +1,29 @@
+
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+
+def get_rnn(rnn_type: str):
+    assert rnn_type in ["lstm", "gru", "rnn"]
+    if rnn_type == "lstm": return tf.keras.layers.LSTM
+    if rnn_type == "gru": return tf.keras.layers.GRU
+    return tf.keras.layers.SimpleRNN
+
+
+def get_conv(conv_type):
+    assert conv_type in ["conv1d", "conv2d"]
+    if conv_type == "conv1d": return tf.keras.layers.Conv1D
+    return tf.keras.layers.Conv2D
diff --git a/tensorflow_asr/utils/utils.py b/tensorflow_asr/utils/math_util.py
old mode 100755
new mode 100644
similarity index 53%
rename from tensorflow_asr/utils/utils.py
rename to tensorflow_asr/utils/math_util.py
index fef55a0cf6..451a9bcb03
--- a/tensorflow_asr/utils/utils.py
+++ b/tensorflow_asr/utils/math_util.py
@@ -12,74 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import re
-import os
-import sys
 import math
-from typing import Union, List
-
 import numpy as np
 import tensorflow as tf
 
+from . import shape_util
 
-def float_feature(list_of_floats):
-    return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_floats))
-
-
-def int64_feature(list_of_ints):
-    return tf.train.Feature(int64_list=tf.train.Int64List(value=list_of_ints))
-
-
-def bytestring_feature(list_of_bytestrings):
-    return tf.train.Feature(bytes_list=tf.train.BytesList(value=list_of_bytestrings))
-
-
-def append_default_keys_dict(default_dict, dest_dict):
-    if not dest_dict:
-        return default_dict
-    for key in default_dict.keys():
-        if key not in dest_dict.keys():
-            dest_dict[key] = default_dict[key]
-    return dest_dict
-
-
-def check_key_in_dict(dictionary, keys):
-    for key in keys:
-        if key not in dictionary.keys():
-            raise ValueError("{} must be defined".format(key))
 
+def log10(x):
+    numerator = tf.math.log(x)
+    denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
+    return numerator / denominator
 
-def is_hdf5_filepath(filepath):
-    return (filepath.endswith('.h5') or filepath.endswith('.keras') or filepath.endswith('.hdf5'))
-
-
-def is_cloud_path(path):
-    """ Check if the path is on cloud (which requires tf.io.gfile)
-
-    Args:
-        path (str): Path to directory or file
-
-    Returns:
-        bool: True if path is on cloud, False otherwise
-    """
-    return bool(re.match(r"^[a-z]+://", path))
-
-
-def preprocess_paths(paths: Union[List, str]):
-    """Expand the path to the root "/"
-
-    Args:
-        paths (Union[List, str]): A path or list of paths
 
-    Returns:
-        Union[List, str]: A processed path or list of paths, return None if it's not path
-    """
-    if isinstance(paths, list):
-        return [path if is_cloud_path(path) else os.path.abspath(os.path.expanduser(path)) for path in paths]
-    elif isinstance(paths, str):
-        return paths if is_cloud_path(paths) else os.path.abspath(os.path.expanduser(paths))
-    else:
-        return None
+def get_num_batches(samples, batch_size, drop_remainders=True):
+    if samples is None or batch_size is None: return None
+    if drop_remainders: return math.floor(float(samples) / float(batch_size))
+    return math.ceil(float(samples) / float(batch_size))
 
 
 def nan_to_zero(input_tensor):
@@ -91,65 +40,23 @@ def bytes_to_string(array: np.ndarray, encoding: str = "utf-8"):
     return [transcript.decode(encoding) for transcript in array]
 
 
-def get_num_batches(samples, batch_size, drop_remainders=True):
-    if samples is None or batch_size is None: return None
-    if drop_remainders: return math.floor(float(samples) / float(batch_size))
-    return math.ceil(float(samples) / float(batch_size))
-
-
-def merge_two_last_dims(x):
-    b, _, f, c = shape_list(x)
-    return tf.reshape(x, shape=[b, -1, f * c])
-
-
-def get_rnn(rnn_type: str):
-    assert rnn_type in ["lstm", "gru", "rnn"]
-    if rnn_type.lower() == "lstm": return tf.keras.layers.LSTM
-    if rnn_type.lower() == "gru": return tf.keras.layers.GRU
-    return tf.keras.layers.SimpleRNN
-
-
-def get_conv(conv_type):
-    assert conv_type in ["conv1d", "conv2d"]
-
-    if conv_type == "conv1d":
-        return tf.keras.layers.Conv1D
-
-    return tf.keras.layers.Conv2D
-
-
-def print_one_line(*args):
-    tf.print("\033[K", end="")
-    tf.print("\r", *args, sep="", end=" ", output_stream=sys.stdout)
-
-
-def read_bytes(path: str) -> tf.Tensor:
-    with tf.io.gfile.GFile(path, "rb") as f:
-        content = f.read()
-    return tf.convert_to_tensor(content, dtype=tf.string)
-
-
-def shape_list(x, out_type=tf.int32):
-    """Deal with dynamic shape in tensorflow cleanly."""
-    static = x.shape.as_list()
-    dynamic = tf.shape(x, out_type=out_type)
-    return [dynamic[i] if s is None else s for i, s in enumerate(static)]
+def get_reduced_length(length, reduction_factor):
+    return tf.cast(tf.math.ceil(tf.divide(length, tf.cast(reduction_factor, dtype=length.dtype))), dtype=tf.int32)
 
 
-def get_shape_invariants(tensor):
-    shapes = shape_list(tensor)
-    return tf.TensorShape([i if isinstance(i, int) else None for i in shapes])
+def count_non_blank(tensor: tf.Tensor, blank: int or tf.Tensor = 0, axis=None):
+    return tf.reduce_sum(tf.where(tf.not_equal(tensor, blank), x=tf.ones_like(tensor), y=tf.zeros_like(tensor)), axis=axis)
 
 
-def get_float_spec(tensor):
-    shape = get_shape_invariants(tensor)
-    return tf.TensorSpec(shape, dtype=tf.float32)
+def merge_two_last_dims(x):
+    b, _, f, c = shape_util.shape_list(x)
+    return tf.reshape(x, shape=[b, -1, f * c])
 
 
 def merge_repeated(yseqs, blank=0):
     result = tf.reshape(yseqs[0], [1])
 
-    U = shape_list(yseqs)[0]
+    U = shape_util.shape_list(yseqs)[0]
     i = tf.constant(1, dtype=tf.int32)
 
     def _cond(i, result, yseqs, U): return tf.less(i, U)
@@ -171,34 +78,7 @@ def _body(i, result, yseqs, U):
         )
     )
 
-    return tf.pad(result, [[U - shape_list(result)[0], 0]], constant_values=blank)
-
-
-def log10(x):
-    numerator = tf.math.log(x)
-    denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
-    return numerator / denominator
-
-
-def get_reduced_length(length, reduction_factor):
-    return tf.cast(tf.math.ceil(tf.divide(length, tf.cast(reduction_factor, dtype=length.dtype))), dtype=tf.int32)
-
-
-def count_non_blank(tensor: tf.Tensor, blank: int or tf.Tensor = 0, axis=None):
-    return tf.reduce_sum(tf.where(tf.not_equal(tensor, blank), x=tf.ones_like(tensor), y=tf.zeros_like(tensor)), axis=axis)
-
-
-def has_gpu_or_tpu():
-    gpus = tf.config.list_logical_devices("GPU")
-    tpus = tf.config.list_logical_devices("TPU")
-    if len(gpus) == 0 and len(tpus) == 0: return False
-    return True
-
-
-def has_tpu():
-    tpus = tf.config.list_logical_devices("TPU")
-    if len(tpus) == 0: return False
-    return True
+    return tf.pad(result, [[U - shape_util.shape_list(result)[0], 0]], constant_values=blank)
 
 
 def find_max_length_prediction_tfarray(tfarray: tf.TensorArray) -> tf.Tensor:
diff --git a/tensorflow_asr/utils/metrics.py b/tensorflow_asr/utils/metric_util.py
similarity index 67%
rename from tensorflow_asr/utils/metrics.py
rename to tensorflow_asr/utils/metric_util.py
index efb59ed452..c26dcc451f 100644
--- a/tensorflow_asr/utils/metrics.py
+++ b/tensorflow_asr/utils/metric_util.py
@@ -13,14 +13,15 @@
 # limitations under the License.
 
 from typing import Tuple
-import tensorflow as tf
 from nltk.metrics import distance
-from .utils import bytes_to_string
+import tensorflow as tf
 
+from . import math_util
 
-def _wer(decode, target):
-    decode = bytes_to_string(decode)
-    target = bytes_to_string(target)
+
+def execute_wer(decode, target):
+    decode = math_util.bytes_to_string(decode)
+    target = math_util.bytes_to_string(target)
     dis = 0.0
     length = 0.0
     for dec, tar in zip(decode, target):
@@ -35,7 +36,7 @@ def _wer(decode, target):
     return tf.convert_to_tensor(dis, tf.float32), tf.convert_to_tensor(length, tf.float32)
 
 
-def wer(_decode: tf.Tensor, _target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+def wer(decode: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
     """Word Error Rate
 
     Args:
@@ -45,12 +46,12 @@ def wer(_decode: tf.Tensor, _target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
     Returns:
         tuple: a tuple of tf.Tensor of (edit distances, number of words) of each text
     """
-    return tf.numpy_function(_wer, inp=[_decode, _target], Tout=[tf.float32, tf.float32])
+    return tf.numpy_function(execute_wer, inp=[decode, target], Tout=[tf.float32, tf.float32])
 
 
-def _cer(decode, target):
-    decode = bytes_to_string(decode)
-    target = bytes_to_string(target)
+def execute_cer(decode, target):
+    decode = math_util.bytes_to_string(decode)
+    target = math_util.bytes_to_string(target)
     dis = 0
     length = 0
     for dec, tar in zip(decode, target):
@@ -59,7 +60,7 @@ def _cer(decode, target):
     return tf.convert_to_tensor(dis, tf.float32), tf.convert_to_tensor(length, tf.float32)
 
 
-def cer(_decode: tf.Tensor, _target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+def cer(decode: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
     """Character Error Rate
 
     Args:
@@ -69,7 +70,7 @@ def cer(_decode: tf.Tensor, _target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
     Returns:
         tuple: a tuple of tf.Tensor of (edit distances, number of characters) of each text
     """
-    return tf.numpy_function(_cer, inp=[_decode, _target], Tout=[tf.float32, tf.float32])
+    return tf.numpy_function(execute_cer, inp=[decode, target], Tout=[tf.float32, tf.float32])
 
 
 def tf_cer(decode: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
@@ -87,21 +88,3 @@ def tf_cer(decode: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
     distances = tf.edit_distance(decode.to_sparse(), target.to_sparse(), normalize=False)  # [B]
     lengths = tf.cast(target.row_lengths(axis=1), dtype=tf.float32)  # [B]
     return tf.reduce_sum(distances), tf.reduce_sum(lengths)
-
-
-class ErrorRate(tf.keras.metrics.Metric):
-    """ Metric for WER and CER """
-
-    def __init__(self, func, name="error_rate", **kwargs):
-        super(ErrorRate, self).__init__(name=name, **kwargs)
-        self.numerator = self.add_weight(name=f"{name}_numerator", initializer="zeros")
-        self.denominator = self.add_weight(name=f"{name}_denominator", initializer="zeros")
-        self.func = func
-
-    def update_state(self, decode: tf.Tensor, target: tf.Tensor):
-        n, d = self.func(decode, target)
-        self.numerator.assign_add(n)
-        self.denominator.assign_add(d)
-
-    def result(self):
-        return tf.math.divide_no_nan(self.numerator, self.denominator) * 100
diff --git a/tensorflow_asr/utils/shape_util.py b/tensorflow_asr/utils/shape_util.py
new file mode 100644
index 0000000000..d482621f0c
--- /dev/null
+++ b/tensorflow_asr/utils/shape_util.py
@@ -0,0 +1,32 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+
+def shape_list(x, out_type=tf.int32):
+    """Deal with dynamic shape in tensorflow cleanly."""
+    static = x.shape.as_list()
+    dynamic = tf.shape(x, out_type=out_type)
+    return [dynamic[i] if s is None else s for i, s in enumerate(static)]
+
+
+def get_shape_invariants(tensor):
+    shapes = shape_list(tensor)
+    return tf.TensorShape([i if isinstance(i, int) else None for i in shapes])
+
+
+def get_float_spec(tensor):
+    shape = get_shape_invariants(tensor)
+    return tf.TensorSpec(shape, dtype=tf.float32)

From 32970d767fa5e637733ce09af4a955a4d285dac0 Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Sat, 10 Apr 2021 17:23:01 +0700
Subject: [PATCH 02/13] :rocket: refactor models

---
 tensorflow_asr/losses/__init__.py             |  17 ---
 .../{keras/ctc_losses.py => ctc_loss.py}      |  30 +++-
 tensorflow_asr/losses/ctc_losses.py           |  26 ----
 tensorflow_asr/losses/keras/__init__.py       |  17 ---
 tensorflow_asr/losses/keras/rnnt_losses.py    |  31 ----
 .../losses/{rnnt_losses.py => rnnt_loss.py}   |  31 +++-
 tensorflow_asr/models/base_model.py           | 144 ++++++++++++------
 tensorflow_asr/models/ctc/__init__.py         |   0
 tensorflow_asr/models/{ => ctc}/ctc.py        | 100 ++++++------
 .../models/{ => ctc}/deepspeech2.py           |  81 +++++++---
 tensorflow_asr/models/{ => ctc}/jasper.py     |  96 +++++++++---
 tensorflow_asr/models/transducer/__init__.py  |   0
 .../models/{ => transducer}/conformer.py      |  10 +-
 .../models/{ => transducer}/contextnet.py     |   2 +-
 .../rnn_transducer.py}                        |  22 +--
 .../models/{ => transducer}/transducer.py     |  83 +++++-----
 tensorflow_asr/optimizers/schedules.py        |   8 +-
 tensorflow_asr/utils/data_util.py             |  43 ++++++
 tensorflow_asr/utils/file_util.py             |  21 +++
 19 files changed, 447 insertions(+), 315 deletions(-)
 rename tensorflow_asr/losses/{keras/ctc_losses.py => ctc_loss.py} (58%)
 delete mode 100644 tensorflow_asr/losses/ctc_losses.py
 delete mode 100644 tensorflow_asr/losses/keras/__init__.py
 delete mode 100644 tensorflow_asr/losses/keras/rnnt_losses.py
 rename tensorflow_asr/losses/{rnnt_losses.py => rnnt_loss.py} (93%)
 create mode 100644 tensorflow_asr/models/ctc/__init__.py
 rename tensorflow_asr/models/{ => ctc}/ctc.py (69%)
 rename tensorflow_asr/models/{ => ctc}/deepspeech2.py (84%)
 rename tensorflow_asr/models/{ => ctc}/jasper.py (74%)
 create mode 100644 tensorflow_asr/models/transducer/__init__.py
 rename tensorflow_asr/models/{ => transducer}/conformer.py (98%)
 mode change 100755 => 100644
 rename tensorflow_asr/models/{ => transducer}/contextnet.py (99%)
 rename tensorflow_asr/models/{streaming_transducer.py => transducer/rnn_transducer.py} (96%)
 rename tensorflow_asr/models/{ => transducer}/transducer.py (94%)
 mode change 100755 => 100644
 create mode 100644 tensorflow_asr/utils/data_util.py

diff --git a/tensorflow_asr/losses/__init__.py b/tensorflow_asr/losses/__init__.py
index f9ae63d25d..e69de29bb2 100644
--- a/tensorflow_asr/losses/__init__.py
+++ b/tensorflow_asr/losses/__init__.py
@@ -1,17 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .ctc_losses import ctc_loss
-from .rnnt_losses import rnnt_loss
-__all__ = ['ctc_loss', 'rnnt_loss']
diff --git a/tensorflow_asr/losses/keras/ctc_losses.py b/tensorflow_asr/losses/ctc_loss.py
similarity index 58%
rename from tensorflow_asr/losses/keras/ctc_losses.py
rename to tensorflow_asr/losses/ctc_loss.py
index 9b46fa6670..6808c57b15 100644
--- a/tensorflow_asr/losses/keras/ctc_losses.py
+++ b/tensorflow_asr/losses/ctc_loss.py
@@ -11,9 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import tensorflow as tf
-from .. import ctc_loss
 
 
 class CtcLoss(tf.keras.losses.Loss):
@@ -23,9 +21,27 @@ def __init__(self, blank=0, global_batch_size=None, name=None):
         self.global_batch_size = global_batch_size
 
     def call(self, y_true, y_pred):
-        logits = y_pred["logit"]
-        logit_length = y_pred["logit_length"]
-        labels = y_true["label"]
-        label_length = y_true["label_length"]
-        loss = ctc_loss(labels, logits, logit_length, label_length, blank=self.blank)
+        logits, logits_length = y_pred.values()
+        labels, labels_length = y_true.values()
+        loss = ctc_loss(
+            y_pred=logits,
+            input_length=logits_length,
+            y_true=labels,
+            label_length=labels_length,
+            blank=self.blank,
+            name=self.name
+        )
         return tf.nn.compute_average_loss(loss, global_batch_size=self.global_batch_size)
+
+
+@tf.function
+def ctc_loss(y_true, y_pred, input_length, label_length, blank, name=None):
+    return tf.nn.ctc_loss(
+        labels=tf.cast(y_true, tf.int32),
+        logit_length=tf.cast(input_length, tf.int32),
+        logits=tf.cast(y_pred, tf.float32),
+        label_length=tf.cast(label_length, tf.int32),
+        logits_time_major=False,
+        blank_index=blank,
+        name=name
+    )
diff --git a/tensorflow_asr/losses/ctc_losses.py b/tensorflow_asr/losses/ctc_losses.py
deleted file mode 100644
index d2eccb0ad9..0000000000
--- a/tensorflow_asr/losses/ctc_losses.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import tensorflow as tf
-
-
-@tf.function
-def ctc_loss(y_true, y_pred, input_length, label_length, blank):
-    return tf.nn.ctc_loss(
-        labels=tf.cast(y_true, tf.int32),
-        logit_length=tf.cast(input_length, tf.int32),
-        logits=tf.cast(y_pred, tf.float32),
-        label_length=tf.cast(label_length, tf.int32),
-        logits_time_major=False,
-        blank_index=blank
-    )
diff --git a/tensorflow_asr/losses/keras/__init__.py b/tensorflow_asr/losses/keras/__init__.py
deleted file mode 100644
index 4b667418c3..0000000000
--- a/tensorflow_asr/losses/keras/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .rnnt_losses import RnntLoss
-from .ctc_losses import CtcLoss
-__all__ = ['RnntLoss', 'CtcLoss']
diff --git a/tensorflow_asr/losses/keras/rnnt_losses.py b/tensorflow_asr/losses/keras/rnnt_losses.py
deleted file mode 100644
index 14e0915e55..0000000000
--- a/tensorflow_asr/losses/keras/rnnt_losses.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tensorflow as tf
-from .. import rnnt_loss
-
-
-class RnntLoss(tf.keras.losses.Loss):
-    def __init__(self, blank=0, global_batch_size=None, name=None):
-        super(RnntLoss, self).__init__(reduction=tf.keras.losses.Reduction.NONE, name=name)
-        self.blank = blank
-        self.global_batch_size = global_batch_size
-
-    def call(self, y_true, y_pred):
-        logits = y_pred["logit"]
-        logit_length = y_pred["logit_length"]
-        labels = y_true["label"]
-        label_length = y_true["label_length"]
-        loss = rnnt_loss(logits, labels, label_length, logit_length, blank=self.blank)
-        return tf.nn.compute_average_loss(loss, global_batch_size=self.global_batch_size)
diff --git a/tensorflow_asr/losses/rnnt_losses.py b/tensorflow_asr/losses/rnnt_loss.py
similarity index 93%
rename from tensorflow_asr/losses/rnnt_losses.py
rename to tensorflow_asr/losses/rnnt_loss.py
index e8a2486a6e..646ec4586f 100644
--- a/tensorflow_asr/losses/rnnt_losses.py
+++ b/tensorflow_asr/losses/rnnt_loss.py
@@ -15,9 +15,11 @@
 
 import tensorflow as tf
 from tensorflow.python.ops.gen_array_ops import matrix_diag_part_v2
-from ..utils.utils import has_gpu_or_tpu
+from ..utils import env_util
 
-use_cpu = not has_gpu_or_tpu()
+use_cpu = not env_util.has_gpu_or_tpu()
+
+LOG_0 = float("-inf")
 
 try:
     from warprnnt_tensorflow import rnnt_loss as warp_rnnt_loss
@@ -28,6 +30,27 @@
     use_warprnnt = False
 
 
+class RnntLoss(tf.keras.losses.Loss):
+    def __init__(self, blank=0, global_batch_size=None, name=None):
+        super(RnntLoss, self).__init__(reduction=tf.keras.losses.Reduction.NONE, name=name)
+        self.blank = blank
+        self.global_batch_size = global_batch_size
+
+    def call(self, y_true, y_pred):
+        logits, logits_length = y_pred.values()
+        labels, labels_length = y_true.values()
+        loss = rnnt_loss(
+            logits=logits,
+            logit_length=logits_length,
+            labels=labels,
+            label_length=labels_length,
+            blank=self.blank,
+            name=self.name
+        )
+        return tf.nn.compute_average_loss(loss, global_batch_size=self.global_batch_size)
+
+
+@tf.function
 def rnnt_loss(logits, labels, label_length, logit_length, blank=0, name=None):
     if use_warprnnt:
         return rnnt_loss_warprnnt(logits=logits, labels=labels,
@@ -36,7 +59,6 @@ def rnnt_loss(logits, labels, label_length, logit_length, blank=0, name=None):
         return rnnt_loss_tf(logits=logits, labels=labels, label_length=label_length, logit_length=logit_length, name=name)
 
 
-@tf.function
 def rnnt_loss_warprnnt(logits, labels, label_length, logit_length, blank=0):
     if not tf.config.list_physical_devices('GPU'):
         logits = tf.nn.log_softmax(logits)
@@ -50,9 +72,6 @@ def rnnt_loss_warprnnt(logits, labels, label_length, logit_length, blank=0):
     return loss
 
 
-LOG_0 = float("-inf")
-
-
 def nan_to_zero(input_tensor):
     return tf.where(tf.math.is_nan(input_tensor), tf.zeros_like(input_tensor), input_tensor)
 
diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py
index c545577abc..b8378410e2 100644
--- a/tensorflow_asr/models/base_model.py
+++ b/tensorflow_asr/models/base_model.py
@@ -12,64 +12,112 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-import abc
-import tempfile
 import tensorflow as tf
+from tensorflow.keras import mixed_precision as mxp
 
-from ..utils import file_util
-
-
-class Model(tf.keras.Model):
-    def __init__(self, name, **kwargs):
-        super(Model, self).__init__(name=name, **kwargs)
-
-    def save(self, filepath, overwrite=True, include_optimizer=True, save_format=None,
-             signatures=None, options=None, save_traces=True):
-        if file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath):
-            _, ext = os.path.splitext(filepath)
-            with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
-                super(Model, self).save(
-                    tmp.name, overwrite=overwrite, include_optimizer=include_optimizer,
-                    save_format=save_format, signatures=signatures, options=options, save_traces=save_traces
-                )
-                tf.io.gfile.copy(tmp.name, filepath, overwrite=True)
-        else:
-            super(Model, self).save(
-                filepath, overwrite=overwrite, include_optimizer=include_optimizer,
-                save_format=save_format, signatures=signatures, options=options, save_traces=save_traces
+from ..utils import file_util, env_util
+
+
+class BaseModel(tf.keras.Model):
+    def save(self,
+             filepath,
+             overwrite=True,
+             include_optimizer=True,
+             save_format=None,
+             signatures=None,
+             options=None,
+             save_traces=True):
+        with file_util.save_file(filepath) as path:
+            super().save(
+                filepath=path,
+                overwrite=overwrite,
+                include_optimizer=include_optimizer,
+                save_format=save_format,
+                signatures=signatures,
+                options=options,
+                save_traces=save_traces
+            )
+
+    def save_weights(self,
+                     filepath,
+                     overwrite=True,
+                     save_format=None,
+                     options=None):
+        with file_util.save_file(filepath) as path:
+            super().save_weights(
+                filepath=path,
+                overwrite=overwrite,
+                save_format=save_format,
+                options=options
             )
 
-    def save_weights(self, filepath, overwrite=True, save_format=None, options=None):
-        if file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath):
-            _, ext = os.path.splitext(filepath)
-            with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
-                super(Model, self).save_weights(tmp.name, overwrite=overwrite, save_format=save_format, options=options)
-                tf.io.gfile.copy(tmp.name, filepath, overwrite=True)
-        else:
-            super(Model, self).save_weights(filepath, overwrite=overwrite, save_format=save_format, options=options)
-
-    def load_weights(self, filepath, by_name=False, skip_mismatch=False, options=None):
-        if file_util.is_cloud_path(filepath) and file_util.is_hdf5_filepath(filepath):
-            _, ext = os.path.splitext(filepath)
-            with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
-                tf.io.gfile.copy(filepath, tmp.name, overwrite=True)
-                super(Model, self).load_weights(tmp.name, by_name=by_name, skip_mismatch=skip_mismatch, options=options)
-        else:
-            super(Model, self).load_weights(filepath, by_name=by_name, skip_mismatch=skip_mismatch, options=options)
-
-    @abc.abstractmethod
+    def load_weights(self,
+                     filepath,
+                     by_name=False,
+                     skip_mismatch=False,
+                     options=None):
+        with file_util.read_file(filepath) as path:
+            super().load_weights(
+                filepath=path,
+                by_name=by_name,
+                skip_mismatch=skip_mismatch,
+                options=options
+            )
+
+    @property
+    def metrics(self):
+        return [self.loss_metric]
+
     def _build(self, *args, **kwargs):
         raise NotImplementedError()
 
-    @abc.abstractmethod
-    def call(self, inputs, training=False, **kwargs):
-        raise NotImplementedError()
+    def compile(self, loss, optimizer, run_eagerly=None, **kwargs):
+        self.use_loss_scale = False
+        if not env_util.has_tpu():
+            optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic")
+            self.use_loss_scale = True
+        self.loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
+        super().compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs)
+
+    # -------------------------------- STEP FUNCTIONS -------------------------------------
+
+    def train_step(self, batch):
+        """Run one optimisation step on ``batch`` and return the current metric values keyed by name."""
+        inputs, y_true = batch
+        with tf.GradientTape() as tape:
+            loss = self.loss(y_true, self(inputs, training=True))
+            # Scale only the gradient target; ``loss`` itself stays unscaled so the metric below is meaningful.
+            scaled_loss = self.optimizer.get_scaled_loss(loss) if self.use_loss_scale else loss
+        gradients = tape.gradient(scaled_loss, self.trainable_weights)
+        if self.use_loss_scale:
+            gradients = self.optimizer.get_unscaled_gradients(gradients)
+        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+        self.loss_metric.update_state(loss)
+        return {m.name: m.result() for m in self.metrics}
+
+    def test_step(self, batch):
+        inputs, y_true = batch
+        y_pred = self(inputs, training=False)
+        loss = self.loss(y_true, y_pred)
+        self.loss_metric.update_state(loss)
+        return {m.name: m.result() for m in self.metrics}
+
+    def predict_step(self, batch):
+        """
+        Args:
+            batch ([tf.Tensor]): a batch of testing data
+
+        Returns:
+            [tf.Tensor]: stacked tensor of shape [B, 3] with each row is the text [truth, greedy, beam_search]
+        """
+        inputs, y_true = batch
+        labels = self.text_featurizer.iextract(y_true)
+        greedy_decoding = self.recognize(inputs)
+        beam_search_decoding = self.recognize_beam(inputs)
+        return tf.stack([labels, greedy_decoding, beam_search_decoding], axis=-1)
 
-    @abc.abstractmethod
     def recognize(self, features, input_lengths, **kwargs):
         pass
 
-    @abc.abstractmethod
     def recognize_beam(self, features, input_lengths, **kwargs):
         pass
diff --git a/tensorflow_asr/models/ctc/__init__.py b/tensorflow_asr/models/ctc/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow_asr/models/ctc.py b/tensorflow_asr/models/ctc/ctc.py
similarity index 69%
rename from tensorflow_asr/models/ctc.py
rename to tensorflow_asr/models/ctc/ctc.py
index a95949544b..ab0b60da16 100644
--- a/tensorflow_asr/models/ctc.py
+++ b/tensorflow_asr/models/ctc/ctc.py
@@ -12,69 +12,55 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import Optional, Union
 import numpy as np
 import tensorflow as tf
-from tensorflow.keras import mixed_precision as mxp
 
-from . import Model
-from ..featurizers.speech_featurizers import TFSpeechFeaturizer
-from ..featurizers.text_featurizers import TextFeaturizer
-from ..utils import math_util, shape_util
-from ..losses.keras.ctc_losses import CtcLoss
-
-
-class CtcModel(Model):
-    def __init__(self, **kwargs):
-        super(CtcModel, self).__init__(**kwargs)
+from ..base_model import BaseModel
+from ...featurizers.speech_featurizers import TFSpeechFeaturizer
+from ...featurizers.text_featurizers import TextFeaturizer
+from ...utils import math_util, shape_util, data_util
+from ...losses.ctc_loss import CtcLoss
+
+
+class CtcModel(BaseModel):
+    def __init__(self,
+                 encoder: tf.keras.Model,
+                 decoder: Union[tf.keras.Model, tf.keras.layers.Layer] = None,
+                 vocabulary_size: int = None,
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.encoder = encoder
+        if decoder is None:
+            assert vocabulary_size is not None, "vocabulary_size must be set"
+            self.decoder = tf.keras.layers.Dense(units=vocabulary_size, name=f"{self.name}_logits")
+        else:
+            self.decoder = decoder
         self.time_reduction_factor = 1
 
-    def _build(self, input_shape, batch_size=None):
-        features = tf.keras.Input(input_shape, batch_size=batch_size, dtype=tf.float32)
-        self(features, training=False)
-
     @property
     def metrics(self):
         return [self.loss_metric]
 
-    def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, **kwargs):
+    def _build(self, input_shape, batch_size=None):
+        inputs = tf.keras.Input(input_shape, batch_size=batch_size, dtype=tf.float32)
+        inputs_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
+        self(
+            data_util.create_inputs(
+                inputs=inputs,
+                inputs_length=inputs_length
+            ),
+            training=False
+        )
+
+    def compile(self,
+                optimizer,
+                global_batch_size,
+                blank=0,
+                run_eagerly=None,
+                **kwargs):
         loss = CtcLoss(blank=blank, global_batch_size=global_batch_size)
-        self.use_loss_scale = use_loss_scale
-        if self.use_loss_scale:
-            optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic")
-        self.loss_metric = tf.keras.metrics.Mean(name="ctc_loss", dtype=tf.float32)
-        super(CtcModel, self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs)
-
-    def train_step(self, batch):
-        x, y_true = batch
-        with tf.GradientTape() as tape:
-            logit = self(x["input"], training=True)
-            y_pred = {
-                "logit": logit,
-                "logit_length": math_util.get_reduced_length(x["input_length"], self.time_reduction_factor)
-            }
-            loss = self.loss(y_true, y_pred)
-            if self.use_loss_scale:
-                scaled_loss = self.optimizer.get_scaled_loss(loss)
-        if self.use_loss_scale:
-            scaled_gradients = tape.gradient(scaled_loss, self.trainable_weights)
-            gradients = self.optimizer.get_unscaled_gradients(scaled_gradients)
-        else:
-            gradients = tape.gradient(loss, self.trainable_weights)
-        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
-        self.loss_metric.update_state(loss)
-        return {m.name: m.result() for m in self.metrics}
-
-    def test_step(self, batch):
-        x, y_true = batch
-        logit = self(x["input"], training=False)
-        y_pred = {
-            "logit": logit,
-            "logit_length": math_util.get_reduced_length(x["input_length"], self.time_reduction_factor)
-        }
-        loss = self.loss(y_true, y_pred)
-        self.loss_metric.update_state(loss)
-        return {m.name: m.result() for m in self.metrics}
+        super().compile(loss=loss, optimizer=optimizer, run_eagerly=run_eagerly, **kwargs)
 
     def add_featurizers(self,
                         speech_featurizer: TFSpeechFeaturizer,
@@ -83,7 +69,13 @@ def add_featurizers(self,
         self.text_featurizer = text_featurizer
 
     def call(self, inputs, training=False, **kwargs):
-        raise NotImplementedError()
+        inputs, inputs_length, _, _ = inputs.values()
+        logits = self.encoder(inputs, training=training, **kwargs)
+        logits = self.decoder(logits, training=training, **kwargs)
+        return data_util.create_logits(
+            logits=logits,
+            logits_length=math_util.get_reduced_length(inputs_length, self.time_reduction_factor)
+        )
 
     # -------------------------------- GREEDY -------------------------------------
 
diff --git a/tensorflow_asr/models/deepspeech2.py b/tensorflow_asr/models/ctc/deepspeech2.py
similarity index 84%
rename from tensorflow_asr/models/deepspeech2.py
rename to tensorflow_asr/models/ctc/deepspeech2.py
index 1e855c5ef3..c8788cbf05 100644
--- a/tensorflow_asr/models/deepspeech2.py
+++ b/tensorflow_asr/models/ctc/deepspeech2.py
@@ -14,9 +14,9 @@
 
 import tensorflow as tf
 
-from ..utils import layer_util, math_util
-from .layers.row_conv_1d import RowConv1D
-from .layers.sequence_wise_bn import SequenceBatchNorm
+from ...utils import layer_util, math_util
+from ..layers.row_conv_1d import RowConv1D
+from ..layers.sequence_wise_bn import SequenceBatchNorm
 from .ctc import CtcModel
 
 
@@ -210,7 +210,6 @@ def get_config(self):
 
 class FcModule(tf.keras.Model):
     def __init__(self,
-                 vocabulary_size: int,
                  nlayers: int = 0,
                  units: int = 1024,
                  dropout: float = 0.1,
@@ -225,28 +224,21 @@ def __init__(self,
             ) for i in range(nlayers)
         ]
 
-        # Fully connected layer
-        self.fc = tf.keras.layers.Dense(units=vocabulary_size,
-                                        use_bias=True, name=f"{self.name}_fc")
-
     def call(self, inputs, training=False, **kwargs):
         outputs = inputs
         for block in self.blocks:
             outputs = block(outputs, training=training, **kwargs)
-        outputs = self.fc(outputs, training=training)
         return outputs
 
     def get_config(self):
         conf = {}
         for block in self.blocks:
             conf.update(block.get_config())
-        conf.update(self.fc.get_config())
         return conf
 
 
-class DeepSpeech2(CtcModel):
+class DeepSpeech2Encoder(tf.keras.Model):
     def __init__(self,
-                 vocabulary_size: int,
                  conv_type: str = "conv2d",
                  conv_kernels: list = [[11, 41], [11, 21], [11, 21]],
                  conv_strides: list = [[2, 2], [1, 2], [1, 2]],
@@ -261,9 +253,9 @@ def __init__(self,
                  fc_nlayers: int = 0,
                  fc_units: int = 1024,
                  fc_dropout: float = 0.1,
-                 name: str = "deepspeech2",
+                 name: str = "deepspeech2_encoder",
                  **kwargs):
-        super(DeepSpeech2, self).__init__(name=name, **kwargs)
+        super().__init__(name=name, **kwargs)  # forward `name`; the f"{self.name}_..." sub-module names depend on it
 
         self.conv_module = ConvModule(
             conv_type=conv_type,
@@ -288,27 +280,68 @@ def __init__(self,
             nlayers=fc_nlayers,
             units=fc_units,
             dropout=fc_dropout,
-            vocabulary_size=vocabulary_size,
             name=f"{self.name}_fc_module"
         )
 
-        self.time_reduction_factor = self.conv_module.reduction_factor
+    def summary(self, line_length=100, **kwargs):
+        self.conv_module.summary(line_length=line_length, **kwargs)
+        self.rnn_module.summary(line_length=line_length, **kwargs)
+        self.fc_module.summary(line_length=line_length, **kwargs)
+        super().summary(line_length=line_length, **kwargs)
 
-    def call(self, inputs, training=False, **kwargs):
+    def call(self, inputs, training, **kwargs):
         outputs = self.conv_module(inputs, training=training, **kwargs)
         outputs = self.rnn_module(outputs, training=training, **kwargs)
         outputs = self.fc_module(outputs, training=training, **kwargs)
         return outputs
 
-    def summary(self, line_length=100, **kwargs):
-        self.conv_module.summary(line_length=line_length, **kwargs)
-        self.rnn_module.summary(line_length=line_length, **kwargs)
-        self.fc_module.summary(line_length=line_length, **kwargs)
-        super(DeepSpeech2, self).summary(line_length=line_length, **kwargs)
-
     def get_config(self):
-        conf = super(DeepSpeech2, self).get_config()
+        conf = super().get_config()
         conf.update(self.conv_module.get_config())
         conf.update(self.rnn_module.get_config())
         conf.update(self.fc_module.get_config())
         return conf
+
+
+class DeepSpeech2(CtcModel):
+    def __init__(self,
+                 vocabulary_size: int,
+                 conv_type: str = "conv2d",
+                 conv_kernels: list = [[11, 41], [11, 21], [11, 21]],
+                 conv_strides: list = [[2, 2], [1, 2], [1, 2]],
+                 conv_filters: list = [32, 32, 96],
+                 conv_dropout: float = 0.1,
+                 rnn_nlayers: int = 5,
+                 rnn_type: str = "lstm",
+                 rnn_units: int = 1024,
+                 rnn_bidirectional: bool = True,
+                 rnn_rowconv: int = 0,
+                 rnn_dropout: float = 0.1,
+                 fc_nlayers: int = 0,
+                 fc_units: int = 1024,
+                 fc_dropout: float = 0.1,
+                 name: str = "deepspeech2",
+                 **kwargs):
+        super().__init__(
+            encoder=DeepSpeech2Encoder(
+                conv_type=conv_type,
+                conv_kernels=conv_kernels,
+                conv_strides=conv_strides,
+                conv_filters=conv_filters,
+                conv_dropout=conv_dropout,
+                rnn_nlayers=rnn_nlayers,
+                rnn_type=rnn_type,
+                rnn_units=rnn_units,
+                rnn_bidirectional=rnn_bidirectional,
+                rnn_rowconv=rnn_rowconv,
+                rnn_dropout=rnn_dropout,
+                fc_nlayers=fc_nlayers,
+                fc_units=fc_units,
+                fc_dropout=fc_dropout,
+                name=f"{name}_encoder"
+            ),
+            vocabulary_size=vocabulary_size,
+            name=name,
+            **kwargs
+        )
+        self.time_reduction_factor = self.encoder.conv_module.reduction_factor
diff --git a/tensorflow_asr/models/jasper.py b/tensorflow_asr/models/ctc/jasper.py
similarity index 74%
rename from tensorflow_asr/models/jasper.py
rename to tensorflow_asr/models/ctc/jasper.py
index a8b0780403..963391a7bb 100644
--- a/tensorflow_asr/models/jasper.py
+++ b/tensorflow_asr/models/ctc/jasper.py
@@ -14,7 +14,7 @@
 
 import tensorflow as tf
 
-from ..utils import math_util
+from ...utils import math_util
 from .ctc import CtcModel
 
 
@@ -195,9 +195,8 @@ def get_config(self):
         return conf
 
 
-class Jasper(CtcModel):
+class JasperEncoder(tf.keras.Model):
     def __init__(self,
-                 vocabulary_size: int,
                  dense: bool = False,
                  first_additional_block_channels: int = 256,
                  first_additional_block_kernels: int = 11,
@@ -220,9 +219,9 @@ def __init__(self,
                  third_additional_block_dropout: int = 0.4,
                  kernel_regularizer=None,
                  bias_regularizer=None,
-                 name: str = "jasper",
+                 name: str = "jasper_encoder",
                  **kwargs):
-        super(Jasper, self).__init__(name=name, **kwargs)
+        super().__init__(name=name, **kwargs)
 
         assert len(block_channels) == len(block_kernels) == len(block_dropout)
 
@@ -275,18 +274,6 @@ def __init__(self,
             name=f"{self.name}_third_block"
         )
 
-        self.last_block = tf.keras.layers.Conv1D(
-            filters=vocabulary_size, kernel_size=1,
-            strides=1, padding="same",
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer,
-            name=f"{self.name}_last_block"
-        )
-
-        self.time_reduction_factor = self.first_additional_block.reduction_factor
-        self.time_reduction_factor *= self.second_additional_block.reduction_factor
-        self.time_reduction_factor *= self.third_additional_block.reduction_factor
-
     def call(self, inputs, training=False, **kwargs):
         outputs = self.reshape(inputs)
         outputs = self.first_additional_block(outputs, training=training, **kwargs)
@@ -297,18 +284,85 @@ def call(self, inputs, training=False, **kwargs):
 
         outputs = self.second_additional_block(outputs, training=training, **kwargs)
         outputs = self.third_additional_block(outputs, training=training, **kwargs)
-        outputs = self.last_block(outputs, training=training, **kwargs)
         return outputs
 
     def summary(self, line_length=100, **kwargs):
-        super(Jasper, self).summary(line_length=line_length, **kwargs)
+        super().summary(line_length=line_length, **kwargs)
 
     def get_config(self):
-        conf = self.reshape.get_config()
+        conf = super().get_config()
+        conf.update(self.reshape.get_config())
         conf.update(self.first_additional_block.get_config())
         for block in self.blocks:
             conf.update(block.get_config())
         conf.update(self.second_additional_block.get_config())
         conf.update(self.third_additional_block.get_config())
-        conf.update(self.last_block.get_config())
         return conf
+
+
+class Jasper(CtcModel):
+    """CTC model pairing a ``JasperEncoder`` with a kernel-size-1 ``Conv1D`` that emits ``vocabulary_size`` channels."""
+    def __init__(self,
+                 vocabulary_size: int,
+                 dense: bool = False,
+                 first_additional_block_channels: int = 256,
+                 first_additional_block_kernels: int = 11,
+                 first_additional_block_strides: int = 2,
+                 first_additional_block_dilation: int = 1,
+                 first_additional_block_dropout: float = 0.2,
+                 nsubblocks: int = 5,
+                 block_channels: list = [256, 384, 512, 640, 768],
+                 block_kernels: list = [11, 13, 17, 21, 25],
+                 block_dropout: list = [0.2, 0.2, 0.2, 0.3, 0.3],
+                 second_additional_block_channels: int = 896,
+                 second_additional_block_kernels: int = 1,
+                 second_additional_block_strides: int = 1,
+                 second_additional_block_dilation: int = 2,
+                 second_additional_block_dropout: float = 0.4,
+                 third_additional_block_channels: int = 1024,
+                 third_additional_block_kernels: int = 1,
+                 third_additional_block_strides: int = 1,
+                 third_additional_block_dilation: int = 1,
+                 third_additional_block_dropout: float = 0.4,
+                 kernel_regularizer=None,
+                 bias_regularizer=None,
+                 name: str = "jasper",
+                 **kwargs):
+        super().__init__(
+            encoder=JasperEncoder(
+                dense=dense,
+                first_additional_block_channels=first_additional_block_channels,
+                first_additional_block_kernels=first_additional_block_kernels,
+                first_additional_block_strides=first_additional_block_strides,
+                first_additional_block_dilation=first_additional_block_dilation,
+                first_additional_block_dropout=first_additional_block_dropout,
+                nsubblocks=nsubblocks,
+                block_channels=block_channels,
+                block_kernels=block_kernels,
+                block_dropout=block_dropout,
+                second_additional_block_channels=second_additional_block_channels,
+                second_additional_block_kernels=second_additional_block_kernels,
+                second_additional_block_strides=second_additional_block_strides,
+                second_additional_block_dilation=second_additional_block_dilation,
+                second_additional_block_dropout=second_additional_block_dropout,
+                third_additional_block_channels=third_additional_block_channels,
+                third_additional_block_kernels=third_additional_block_kernels,
+                third_additional_block_strides=third_additional_block_strides,
+                third_additional_block_dilation=third_additional_block_dilation,
+                third_additional_block_dropout=third_additional_block_dropout,
+                kernel_regularizer=kernel_regularizer,
+                bias_regularizer=bias_regularizer,
+            ),
+            # Use the `name` argument here: `self.name` is not set until the super().__init__() call has run.
+            decoder=tf.keras.layers.Conv1D(
+                filters=vocabulary_size, kernel_size=1, strides=1, padding="same",
+                kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
+                name=f"{name}_logits"
+            ),
+            vocabulary_size=vocabulary_size,
+            name=name,
+            **kwargs
+        )
+        self.time_reduction_factor = self.encoder.first_additional_block.reduction_factor
+        self.time_reduction_factor *= self.encoder.second_additional_block.reduction_factor
+        self.time_reduction_factor *= self.encoder.third_additional_block.reduction_factor
diff --git a/tensorflow_asr/models/transducer/__init__.py b/tensorflow_asr/models/transducer/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow_asr/models/conformer.py b/tensorflow_asr/models/transducer/conformer.py
old mode 100755
new mode 100644
similarity index 98%
rename from tensorflow_asr/models/conformer.py
rename to tensorflow_asr/models/transducer/conformer.py
index a13dfa1d19..f66197d972
--- a/tensorflow_asr/models/conformer.py
+++ b/tensorflow_asr/models/transducer/conformer.py
@@ -14,12 +14,12 @@
 
 import tensorflow as tf
 
-from .activations.glu import GLU
+from ..activations.glu import GLU
 from .transducer import Transducer
-from .layers.subsampling import VggSubsampling, Conv2dSubsampling
-from .layers.positional_encoding import PositionalEncoding, PositionalEncodingConcat
-from .layers.multihead_attention import MultiHeadAttention, RelPositionMultiHeadAttention
-from ..utils import shape_util
+from ..layers.subsampling import VggSubsampling, Conv2dSubsampling
+from ..layers.positional_encoding import PositionalEncoding, PositionalEncodingConcat
+from ..layers.multihead_attention import MultiHeadAttention, RelPositionMultiHeadAttention
+from ...utils import shape_util
 
 L2 = tf.keras.regularizers.l2(1e-6)
 
diff --git a/tensorflow_asr/models/contextnet.py b/tensorflow_asr/models/transducer/contextnet.py
similarity index 99%
rename from tensorflow_asr/models/contextnet.py
rename to tensorflow_asr/models/transducer/contextnet.py
index 636560101d..dac9e9050d 100644
--- a/tensorflow_asr/models/contextnet.py
+++ b/tensorflow_asr/models/transducer/contextnet.py
@@ -16,7 +16,7 @@
 from typing import List
 import tensorflow as tf
 from .transducer import Transducer
-from ..utils import math_util
+from ...utils import math_util
 
 L2 = tf.keras.regularizers.l2(1e-6)
 
diff --git a/tensorflow_asr/models/streaming_transducer.py b/tensorflow_asr/models/transducer/rnn_transducer.py
similarity index 96%
rename from tensorflow_asr/models/streaming_transducer.py
rename to tensorflow_asr/models/transducer/rnn_transducer.py
index ba793126e2..88ef18d80c 100644
--- a/tensorflow_asr/models/streaming_transducer.py
+++ b/tensorflow_asr/models/transducer/rnn_transducer.py
@@ -15,16 +15,16 @@
 
 import tensorflow as tf
 
-from .layers.subsampling import TimeReduction
+from ..layers.subsampling import TimeReduction
 from .transducer import Transducer
-from ..utils import layer_util, math_util, shape_util
+from ...utils import layer_util, math_util, shape_util
 
 
 class Reshape(tf.keras.layers.Layer):
     def call(self, inputs): return math_util.merge_two_last_dims(inputs)
 
 
-class StreamingTransducerBlock(tf.keras.Model):
+class RnnTransducerBlock(tf.keras.Model):
     def __init__(self,
                  reduction_factor: int = 0,
                  dmodel: int = 640,
@@ -34,7 +34,7 @@ def __init__(self,
                  kernel_regularizer=None,
                  bias_regularizer=None,
                  **kwargs):
-        super(StreamingTransducerBlock, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
         if reduction_factor > 0:
             self.reduction = TimeReduction(reduction_factor, name=f"{self.name}_reduction")
@@ -94,7 +94,7 @@ def get_config(self):
         return conf
 
 
-class StreamingTransducerEncoder(tf.keras.Model):
+class RnnTransducerEncoder(tf.keras.Model):
     def __init__(self,
                  reductions: dict = {0: 3, 1: 2},
                  dmodel: int = 640,
@@ -105,12 +105,12 @@ def __init__(self,
                  kernel_regularizer=None,
                  bias_regularizer=None,
                  **kwargs):
-        super(StreamingTransducerEncoder, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
         self.reshape = Reshape(name=f"{self.name}_reshape")
 
         self.blocks = [
-            StreamingTransducerBlock(
+            RnnTransducerBlock(
                 reduction_factor=reductions.get(i, 0),  # key is index, value is the factor
                 dmodel=dmodel,
                 rnn_type=rnn_type,
@@ -174,7 +174,7 @@ def get_config(self):
         return conf
 
 
-class StreamingTransducer(Transducer):
+class RnnTransducer(Transducer):
     def __init__(self,
                  vocabulary_size: int,
                  encoder_reductions: dict = {0: 3, 1: 2},
@@ -200,10 +200,10 @@ def __init__(self,
                  joint_trainable: bool = True,
                  kernel_regularizer = None,
                  bias_regularizer = None,
-                 name = "StreamingTransducer",
+                 name = "RnnTransducer",
                  **kwargs):
-        super(StreamingTransducer, self).__init__(
-            encoder=StreamingTransducerEncoder(
+        super().__init__(
+            encoder=RnnTransducerEncoder(
                 reductions=encoder_reductions,
                 dmodel=encoder_dmodel,
                 nlayers=encoder_nlayers,
diff --git a/tensorflow_asr/models/transducer.py b/tensorflow_asr/models/transducer/transducer.py
old mode 100755
new mode 100644
similarity index 94%
rename from tensorflow_asr/models/transducer.py
rename to tensorflow_asr/models/transducer/transducer.py
index efd3c4d55e..8917bf5a3b
--- a/tensorflow_asr/models/transducer.py
+++ b/tensorflow_asr/models/transducer/transducer.py
@@ -15,14 +15,13 @@
 
 import collections
 import tensorflow as tf
-from tensorflow.keras import mixed_precision as mxp
 
-from . import Model
-from ..utils import math_util, layer_util, shape_util
-from ..featurizers.speech_featurizers import SpeechFeaturizer
-from ..featurizers.text_featurizers import TextFeaturizer
-from .layers.embedding import Embedding
-from ..losses.keras.rnnt_losses import RnntLoss
+from ..base_model import BaseModel
+from ...utils import math_util, layer_util, shape_util, data_util
+from ...featurizers.speech_featurizers import SpeechFeaturizer
+from ...featurizers.text_featurizers import TextFeaturizer
+from ..layers.embedding import Embedding
+from ...losses.rnnt_loss import RnntLoss
 
 Hypothesis = collections.namedtuple("Hypothesis", ("index", "prediction", "states"))
 
@@ -44,7 +43,7 @@ def __init__(self,
                  bias_regularizer=None,
                  name="transducer_prediction",
                  **kwargs):
-        super(TransducerPrediction, self).__init__(name=name, **kwargs)
+        super().__init__(name=name, **kwargs)
         self.embed = Embedding(vocabulary_size, embed_dim,
                                regularizer=kernel_regularizer, name=f"{name}_embedding")
         self.do = tf.keras.layers.Dropout(embed_dropout, name=f"{name}_dropout")
@@ -148,7 +147,7 @@ def __init__(self,
                  axis: int = 1,
                  name="transducer_joint_reshape",
                  **kwargs):
-        super(TransducerJointReshape, self).__init__(name=name, trainable=False, **kwargs)
+        super().__init__(name=name, trainable=False, **kwargs)
         self.axis = axis
 
     def call(self, inputs, repeats=None, **kwargs):
@@ -173,7 +172,7 @@ def __init__(self,
                  bias_regularizer=None,
                  name="tranducer_joint",
                  **kwargs):
-        super(TransducerJoint, self).__init__(name=name, **kwargs)
+        super().__init__(name=name, **kwargs)
 
         activation = activation.lower()
         if activation == "linear":
@@ -248,7 +247,7 @@ def get_config(self):
         return conf
 
 
-class Transducer(Model):
+class Transducer(BaseModel):
     """ Transducer Model Warper """
 
     def __init__(self,
@@ -273,7 +272,7 @@ def __init__(self,
                  bias_regularizer=None,
                  name="transducer",
                  **kwargs):
-        super(Transducer, self).__init__(name=name, **kwargs)
+        super().__init__(name=name, **kwargs)
         self.encoder = encoder
         self.predict_net = TransducerPrediction(
             vocabulary_size=vocabulary_size,
@@ -304,21 +303,20 @@ def __init__(self,
         )
         self.time_reduction_factor = 1
 
-    @property
-    def metrics(self):
-        return [self.loss_metric]
-
     def _build(self, input_shape, prediction_shape=[None], batch_size=None):
         inputs = tf.keras.Input(shape=input_shape, batch_size=batch_size, dtype=tf.float32)
-        input_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
-        pred = tf.keras.Input(shape=prediction_shape, batch_size=batch_size, dtype=tf.int32)
-        pred_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
-        self({
-            "input": inputs,
-            "input_length": input_length,
-            "prediction": pred,
-            "prediction_length": pred_length
-        }, training=False)
+        inputs_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
+        predictions = tf.keras.Input(shape=prediction_shape, batch_size=batch_size, dtype=tf.int32)
+        predictions_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
+        self(
+            data_util.create_inputs(
+                inputs=inputs,
+                inputs_length=inputs_length,
+                predictions=predictions,
+                predictions_length=predictions_length
+            ),
+            training=False
+        )
 
     def summary(self, line_length=None, **kwargs):
         if self.encoder is not None: self.encoder.summary(line_length=line_length, **kwargs)
@@ -339,27 +337,26 @@ def add_featurizers(self,
         self.speech_featurizer = speech_featurizer
         self.text_featurizer = text_featurizer
 
-    def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, **kwargs):
+    def compile(self,
+                optimizer,
+                global_batch_size,
+                blank=0,
+                run_eagerly=None,
+                **kwargs):
         loss = RnntLoss(blank=blank, global_batch_size=global_batch_size)
-        self.use_loss_scale = use_loss_scale
-        if self.use_loss_scale:
-            optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic")
-        self.loss_metric = tf.keras.metrics.Mean(name="rnnt_loss", dtype=tf.float32)
-        super(Transducer, self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs)
+        super().compile(loss=loss, optimizer=optimizer, run_eagerly=run_eagerly, **kwargs)
 
     def call(self, inputs, training=False, **kwargs):
-        features = inputs["input"]
-        prediction = inputs["prediction"]
-        prediction_length = inputs["prediction_length"]
-        enc = self.encoder(features, training=training, **kwargs)
-        pred = self.predict_net([prediction, prediction_length], training=training, **kwargs)
-        outputs = self.joint_net([enc, pred], training=training, **kwargs)
-        return {
-            "logit": outputs,
-            "logit_length": math_util.get_reduced_length(inputs["input_length"], self.time_reduction_factor)
-        }
-
-    # -------------------------------- INFERENCES-------------------------------------
+        inputs, inputs_length, predictions, predictions_length = inputs.values()
+        enc = self.encoder(inputs, training=training, **kwargs)
+        pred = self.predict_net([predictions, predictions_length], training=training, **kwargs)
+        logits = self.joint_net([enc, pred], training=training, **kwargs)
+        return data_util.create_logits(
+            logits=logits,
+            logits_length=math_util.get_reduced_length(inputs_length, self.time_reduction_factor)
+        )
+
+    # -------------------------------- INFERENCES -------------------------------------
 
     def encoder_inference(self, features: tf.Tensor):
         """Infer function for encoder (or encoders)
diff --git a/tensorflow_asr/optimizers/schedules.py b/tensorflow_asr/optimizers/schedules.py
index 1edd8003e9..ec8d151774 100755
--- a/tensorflow_asr/optimizers/schedules.py
+++ b/tensorflow_asr/optimizers/schedules.py
@@ -103,13 +103,14 @@ class CyclicTransformerSchedule(tf.keras.optimizers.schedules.LearningRateSchedu
         step_size: number of training iterations per
             half cycle. Authors suggest setting step_size
             2-8 x training iterations in epoch.
-    
+
     It is inspired from the paper:
     # References
       - [Cyclical Learning Rates for Training Neural Networks](
       https://arxiv.org/abs/1506.01186)
     """
-    def __init__(self, d_model, warmup_steps=4000, max_lr=None, 
+
+    def __init__(self, d_model, warmup_steps=4000, max_lr=None,
                  step_size=None):
         """Applies triangular cyclic to the square root decay learning rate.
         Args:
@@ -134,7 +135,7 @@ def __call__(self, step):
         cycle = tf.math.floor(1 + step / (2 * self.step_size))
         x = tf.math.abs(step / self.step_size - 2 * cycle + 1)
         lr = lr * (0.5 + tf.math.maximum(0., x))
-        lr = tf.math.minimum(self.max_lr, 
+        lr = tf.math.minimum(self.max_lr,
                              tf.math.minimum(lr, warmup))
         return lr
 
@@ -145,4 +146,3 @@ def get_config(self):
             "max_lr": self.max_lr,
             "step_size": self.step_size
         }
-    
\ No newline at end of file
diff --git a/tensorflow_asr/utils/data_util.py b/tensorflow_asr/utils/data_util.py
new file mode 100644
index 0000000000..324c720d49
--- /dev/null
+++ b/tensorflow_asr/utils/data_util.py
@@ -0,0 +1,43 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# tf.data.Dataset does not work well for namedtuple so we are using dict
+
+import tensorflow as tf
+
+
+def create_inputs(inputs: tf.Tensor,
+                  inputs_length: tf.Tensor,
+                  predictions: tf.Tensor = None,
+                  predictions_length: tf.Tensor = None) -> dict:
+    return {
+        "inputs": inputs,
+        "inputs_length": inputs_length,
+        "predictions": predictions,
+        "predictions_length": predictions_length
+    }
+
+
+def create_logits(logits: tf.Tensor, logits_length: tf.Tensor) -> dict:
+    return {
+        "logits": logits,
+        "logits_length": logits_length
+    }
+
+
+def create_labels(labels: tf.Tensor, labels_length: tf.Tensor) -> dict:
+    return {
+        "labels": labels,
+        "labels_length": labels_length,
+    }
diff --git a/tensorflow_asr/utils/file_util.py b/tensorflow_asr/utils/file_util.py
index c9d1c867d0..0d69315c87 100644
--- a/tensorflow_asr/utils/file_util.py
+++ b/tensorflow_asr/utils/file_util.py
@@ -14,6 +14,7 @@
 
 import os
 import re
+import tempfile
 from typing import Union, List
 import tensorflow as tf
 
@@ -55,3 +56,23 @@ def read_bytes(path: str) -> tf.Tensor:
     with tf.io.gfile.GFile(path, "rb") as f:
         content = f.read()
     return tf.convert_to_tensor(content, dtype=tf.string)
+
+
+def save_file(filepath):
+    if is_cloud_path(filepath) and is_hdf5_filepath(filepath):
+        _, ext = os.path.splitext(filepath)
+        with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
+            yield tmp.name
+            tf.io.gfile.copy(tmp.name, filepath, overwrite=True)
+    else:
+        yield filepath
+
+
+def read_file(filepath):
+    if is_cloud_path(filepath) and is_hdf5_filepath(filepath):
+        _, ext = os.path.splitext(filepath)
+        with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
+            tf.io.gfile.copy(filepath, tmp.name, overwrite=True)
+            yield tmp.name
+    else:
+        yield filepath

From 65f222d31005fdc5a45e22e0ee847f419c92849c Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Tue, 13 Apr 2021 01:07:40 +0700
Subject: [PATCH 03/13] :rocket: refactor featurizers, datasets, augmentations

---
 tensorflow_asr/augmentations/augmentation.py  |  58 ++
 tensorflow_asr/augmentations/augments.py      |  86 ---
 .../augmentations/methods/__init__.py         |   0
 .../methods/base_method.py}                   |  10 +-
 .../augmentations/methods/specaugment.py      |  75 +++
 .../augmentations/signal_augment.py           | 101 ---
 tensorflow_asr/augmentations/spec_augment.py  | 203 ------
 tensorflow_asr/configs/config.py              |  16 +-
 tensorflow_asr/datasets/asr_dataset.py        |  79 ++-
 tensorflow_asr/datasets/keras/__init__.py     |  16 -
 tensorflow_asr/datasets/keras/asr_dataset.py  | 176 ------
 .../featurizers/methods/__init__.py           |   0
 .../featurizers/{ => methods}/gammatone.py    |   0
 .../featurizers/speech_featurizers.py         |  36 +-
 .../featurizers/text_featurizers.py           | 117 +---
 tensorflow_asr/featurizers/wordpiece.py       | 577 ------------------
 tensorflow_asr/models/keras/conformer.py      |  93 ---
 tensorflow_asr/models/keras/contextnet.py     | 181 ------
 tensorflow_asr/models/keras/ctc.py            |  66 --
 tensorflow_asr/models/keras/deepspeech2.py    |  86 ---
 tensorflow_asr/models/keras/jasper.py         | 137 -----
 .../models/keras/streaming_transducer.py      | 201 ------
 tensorflow_asr/models/keras/transducer.py     |  93 ---
 tensorflow_asr/runners/README.md              |  24 -
 tensorflow_asr/runners/__init__.py            |  42 --
 tensorflow_asr/runners/base_runners.py        | 498 ---------------
 tensorflow_asr/runners/ctc_runners.py         | 139 -----
 tensorflow_asr/runners/transducer_runners.py  | 136 -----
 28 files changed, 224 insertions(+), 3022 deletions(-)
 create mode 100644 tensorflow_asr/augmentations/augmentation.py
 delete mode 100755 tensorflow_asr/augmentations/augments.py
 create mode 100644 tensorflow_asr/augmentations/methods/__init__.py
 rename tensorflow_asr/{models/keras/__init__.py => augmentations/methods/base_method.py} (80%)
 create mode 100644 tensorflow_asr/augmentations/methods/specaugment.py
 delete mode 100644 tensorflow_asr/augmentations/signal_augment.py
 delete mode 100755 tensorflow_asr/augmentations/spec_augment.py
 delete mode 100644 tensorflow_asr/datasets/keras/__init__.py
 delete mode 100644 tensorflow_asr/datasets/keras/asr_dataset.py
 create mode 100644 tensorflow_asr/featurizers/methods/__init__.py
 rename tensorflow_asr/featurizers/{ => methods}/gammatone.py (100%)
 delete mode 100644 tensorflow_asr/featurizers/wordpiece.py
 delete mode 100644 tensorflow_asr/models/keras/conformer.py
 delete mode 100644 tensorflow_asr/models/keras/contextnet.py
 delete mode 100644 tensorflow_asr/models/keras/ctc.py
 delete mode 100644 tensorflow_asr/models/keras/deepspeech2.py
 delete mode 100644 tensorflow_asr/models/keras/jasper.py
 delete mode 100644 tensorflow_asr/models/keras/streaming_transducer.py
 delete mode 100644 tensorflow_asr/models/keras/transducer.py
 delete mode 100644 tensorflow_asr/runners/README.md
 delete mode 100644 tensorflow_asr/runners/__init__.py
 delete mode 100644 tensorflow_asr/runners/base_runners.py
 delete mode 100644 tensorflow_asr/runners/ctc_runners.py
 delete mode 100644 tensorflow_asr/runners/transducer_runners.py

diff --git a/tensorflow_asr/augmentations/augmentation.py b/tensorflow_asr/augmentations/augmentation.py
new file mode 100644
index 0000000000..314a6488b6
--- /dev/null
+++ b/tensorflow_asr/augmentations/augmentation.py
@@ -0,0 +1,58 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+from .methods import specaugment
+
+
+AUGMENTATIONS = {
+    "freq_masking": specaugment.FreqMasking,
+    "time_masking": specaugment.TimeMasking,
+}
+
+
+class Augmentation:
+    def __init__(self, config: dict = None):
+        if not config: config = {}
+        self.prob = float(config.pop("prob", 0.5))
+        self.before = self.parse(config.pop("before", {}))
+        self.after = self.parse(config.pop("after", {}))
+
+    def _augment(self, inputs, augmentations):
+        outputs = inputs
+        for au in augmentations:
+            p = tf.random.uniform([])
+            outputs = tf.where(tf.less(p, self.prob), au.augment(outputs), outputs)
+        return outputs
+
+    @tf.function
+    def signal_augment(self, inputs):
+        return self._augment(inputs, self.before)
+
+    @tf.function
+    def feature_augment(self, inputs):
+        return self._augment(inputs, self.after)
+
+    @staticmethod
+    def parse(config: dict) -> list:
+        augmentations = []
+        for key, value in config.items():
+            au = AUGMENTATIONS.get(key, None)
+            if au is None:
+                raise KeyError(f"No tf augmentation named: {key}\n"
+                               f"Available tf augmentations: {AUGMENTATIONS.keys()}")
+            aug = au(**value) if value is not None else au()
+            augmentations.append(aug)
+        return augmentations
diff --git a/tensorflow_asr/augmentations/augments.py b/tensorflow_asr/augmentations/augments.py
deleted file mode 100755
index 24a59841de..0000000000
--- a/tensorflow_asr/augmentations/augments.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tensorflow as tf
-import nlpaug.flow as naf
-
-from .signal_augment import SignalCropping, SignalLoudness, SignalMask, SignalNoise, \
-    SignalPitch, SignalShift, SignalSpeed, SignalVtlp
-from .spec_augment import FreqMasking, TimeMasking, TFFreqMasking, TFTimeMasking
-
-
-AUGMENTATIONS = {
-    "freq_masking": FreqMasking,
-    "time_masking": TimeMasking,
-    "noise": SignalNoise,
-    "masking": SignalMask,
-    "cropping": SignalCropping,
-    "loudness": SignalLoudness,
-    "pitch": SignalPitch,
-    "shift": SignalShift,
-    "speed": SignalSpeed,
-    "vtlp": SignalVtlp
-}
-
-TFAUGMENTATIONS = {
-    "freq_masking": TFFreqMasking,
-    "time_masking": TFTimeMasking,
-}
-
-
-class TFAugmentationExecutor:
-    def __init__(self, augmentations: list, prob: float = 0.5):
-        self.augmentations = augmentations
-        self.prob = prob
-
-    @tf.function
-    def augment(self, inputs):
-        outputs = inputs
-        for au in self.augmentations:
-            p = tf.random.uniform([])
-            outputs = tf.where(tf.less(p, self.prob), au.augment(outputs), outputs)
-        return outputs
-
-
-class Augmentation:
-    def __init__(self, config: dict = None, use_tf: bool = False):
-        if not config: config = {}
-        prob = float(config.pop("prob", 0.5))
-        parser = self.tf_parse if use_tf else self.parse
-        self.before = parser(config.pop("before", {}), prob=prob)
-        self.after = parser(config.pop("after", {}), prob=prob)
-
-    @staticmethod
-    def parse(config: dict, prob: float = 0.5) -> naf.Sometimes:
-        augmentations = []
-        for key, value in config.items():
-            au = AUGMENTATIONS.get(key, None)
-            if au is None:
-                raise KeyError(f"No augmentation named: {key}\n"
-                               f"Available augmentations: {AUGMENTATIONS.keys()}")
-            aug = au(**value) if value is not None else au()
-            augmentations.append(aug)
-        return naf.Sometimes(augmentations, pipeline_p=prob)
-
-    @staticmethod
-    def tf_parse(config: dict, prob: float = 0.5) -> TFAugmentationExecutor:
-        augmentations = []
-        for key, value in config.items():
-            au = TFAUGMENTATIONS.get(key, None)
-            if au is None:
-                raise KeyError(f"No tf augmentation named: {key}\n"
-                               f"Available tf augmentations: {TFAUGMENTATIONS.keys()}")
-            aug = au(**value) if value is not None else au()
-            augmentations.append(aug)
-        return TFAugmentationExecutor(augmentations, prob=prob)
diff --git a/tensorflow_asr/augmentations/methods/__init__.py b/tensorflow_asr/augmentations/methods/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow_asr/models/keras/__init__.py b/tensorflow_asr/augmentations/methods/base_method.py
similarity index 80%
rename from tensorflow_asr/models/keras/__init__.py
rename to tensorflow_asr/augmentations/methods/base_method.py
index c494840752..6cc9c0f759 100644
--- a/tensorflow_asr/models/keras/__init__.py
+++ b/tensorflow_asr/augmentations/methods/base_method.py
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .transducer import Transducer
-from .conformer import Conformer
-__all__ = ['Transducer', 'Conformer']
+import tensorflow as tf
+
+
+class AugmentationMethod:
+    @tf.function
+    def augment(self, *args, **kwargs):
+        raise NotImplementedError()
diff --git a/tensorflow_asr/augmentations/methods/specaugment.py b/tensorflow_asr/augmentations/methods/specaugment.py
new file mode 100644
index 0000000000..b948b6644e
--- /dev/null
+++ b/tensorflow_asr/augmentations/methods/specaugment.py
@@ -0,0 +1,75 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+from ...utils import shape_util
+from .base_method import AugmentationMethod
+
+
+class FreqMasking(AugmentationMethod):
+    def __init__(self, num_masks: int = 1, mask_factor: float = 27):
+        self.num_masks = num_masks
+        self.mask_factor = mask_factor
+
+    @tf.function
+    def augment(self, spectrogram: tf.Tensor):
+        """
+        Masking the frequency channels (shape[1])
+        Args:
+            spectrogram: shape (T, num_feature_bins, V)
+        Returns:
+            frequency masked spectrogram
+        """
+        T, F, V = shape_util.shape_list(spectrogram, out_type=tf.int32)
+        for _ in range(self.num_masks):
+            f = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32)
+            f = tf.minimum(f, F)
+            f0 = tf.random.uniform([], minval=0, maxval=(F - f), dtype=tf.int32)
+            mask = tf.concat([
+                tf.ones([T, f0, V], dtype=spectrogram.dtype),
+                tf.zeros([T, f, V], dtype=spectrogram.dtype),
+                tf.ones([T, F - f0 - f, V], dtype=spectrogram.dtype)
+            ], axis=1)
+            spectrogram = spectrogram * mask
+        return spectrogram
+
+
+class TimeMasking(AugmentationMethod):
+    def __init__(self, num_masks: int = 1, mask_factor: float = 100, p_upperbound: float = 1.0):
+        self.num_masks = num_masks
+        self.mask_factor = mask_factor
+        self.p_upperbound = p_upperbound
+
+    @tf.function
+    def augment(self, spectrogram: tf.Tensor):
+        """
+        Masking the time channel (shape[0])
+        Args:
+            spectrogram: shape (T, num_feature_bins, V)
+        Returns:
+            time masked spectrogram
+        """
+        T, F, V = shape_util.shape_list(spectrogram, out_type=tf.int32)
+        for _ in range(self.num_masks):
+            t = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32)
+            t = tf.minimum(t, tf.cast(tf.cast(T, dtype=tf.float32) * self.p_upperbound, dtype=tf.int32))
+            t0 = tf.random.uniform([], minval=0, maxval=(T - t), dtype=tf.int32)
+            mask = tf.concat([
+                tf.ones([t0, F, V], dtype=spectrogram.dtype),
+                tf.zeros([t, F, V], dtype=spectrogram.dtype),
+                tf.ones([T - t0 - t, F, V], dtype=spectrogram.dtype)
+            ], axis=0)
+            spectrogram = spectrogram * mask
+        return spectrogram
diff --git a/tensorflow_asr/augmentations/signal_augment.py b/tensorflow_asr/augmentations/signal_augment.py
deleted file mode 100644
index c0b2444b2e..0000000000
--- a/tensorflow_asr/augmentations/signal_augment.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import glob
-import librosa
-import nlpaug.augmenter.audio as naa
-
-
-class SignalCropping(naa.CropAug):
-    def __init__(self,
-                 zone=(0.2, 0.8),
-                 coverage=0.1,
-                 crop_range=(0.2, 0.8),
-                 crop_factor=2):
-        super(SignalCropping, self).__init__(sampling_rate=None, zone=zone, coverage=coverage,
-                                             crop_range=crop_range, crop_factor=crop_factor,
-                                             duration=None)
-
-
-class SignalLoudness(naa.LoudnessAug):
-    def __init__(self,
-                 zone=(0.2, 0.8),
-                 coverage=1.,
-                 factor=(0.5, 2)):
-        super(SignalLoudness, self).__init__(zone=zone, coverage=coverage, factor=factor)
-
-
-class SignalMask(naa.MaskAug):
-    def __init__(self,
-                 zone=(0.2, 0.8),
-                 coverage=1.,
-                 mask_range=(0.2, 0.8),
-                 mask_factor=2,
-                 mask_with_noise=True):
-        super(SignalMask, self).__init__(sampling_rate=None, zone=zone, coverage=coverage,
-                                         duration=None, mask_range=mask_range,
-                                         mask_factor=mask_factor,
-                                         mask_with_noise=mask_with_noise)
-
-
-class SignalNoise(naa.NoiseAug):
-    def __init__(self,
-                 sample_rate=16000,
-                 zone=(0.2, 0.8),
-                 coverage=1.,
-                 color="random",
-                 noises: str = None):
-        if noises is not None:
-            noises = glob.glob(os.path.join(noises, "**", "*.wav"), recursive=True)
-            noises = [librosa.load(n, sr=sample_rate)[0] for n in noises]
-        super(SignalNoise, self).__init__(zone=zone, coverage=coverage,
-                                          color=color, noises=noises)
-
-
-class SignalPitch(naa.PitchAug):
-    def __init__(self,
-                 zone=(0.2, 0.8),
-                 coverage=1.,
-                 factor=(-10, 10)):
-        super(SignalPitch, self).__init__(None, zone=zone, coverage=coverage,
-                                          duration=None, factor=factor)
-
-
-class SignalShift(naa.ShiftAug):
-    def __init__(self,
-                 sample_rate=16000,
-                 duration=3,
-                 direction="random"):
-        super(SignalShift, self).__init__(sample_rate, duration=duration, direction=direction)
-
-
-class SignalSpeed(naa.SpeedAug):
-    def __init__(self,
-                 zone=(0.2, 0.8),
-                 coverage=1.,
-                 factor=(0.5, 2)):
-        super(SignalSpeed, self).__init__(zone=zone, coverage=coverage,
-                                          duration=None, factor=factor)
-
-
-class SignalVtlp(naa.VtlpAug):
-    def __init__(self,
-                 sample_rate=16000,
-                 zone=(0.2, 0.8),
-                 coverage=0.1,
-                 fhi=4800,
-                 factor=(0.9, 1.1)):
-        super(SignalVtlp, self).__init__(sample_rate, zone=zone, coverage=coverage,
-                                         duration=None, fhi=fhi, factor=factor)
diff --git a/tensorflow_asr/augmentations/spec_augment.py b/tensorflow_asr/augmentations/spec_augment.py
deleted file mode 100755
index 9e1f68726d..0000000000
--- a/tensorflow_asr/augmentations/spec_augment.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-""" Augmentation on spectrogram: http://arxiv.org/abs/1904.08779 """
-
-import numpy as np
-import tensorflow as tf
-
-from nlpaug.flow import Sequential
-from nlpaug.util import Action
-from nlpaug.model.spectrogram import Spectrogram
-from nlpaug.augmenter.spectrogram import SpectrogramAugmenter
-
-from ..utils.utils import shape_list
-
-# ---------------------------- FREQ MASKING ----------------------------
-
-
-class FreqMaskingModel(Spectrogram):
-    def __init__(self, mask_factor: int = 27):
-        """
-        Args:
-            freq_mask_param: parameter F of frequency masking
-        """
-        super(FreqMaskingModel, self).__init__()
-        self.mask_factor = mask_factor
-
-    def mask(self, data: np.ndarray) -> np.ndarray:
-        """
-        Masking the frequency channels (make features on some channel 0)
-        Args:
-            spectrogram: shape (T, num_feature_bins, V)
-        Returns:
-            frequency masked spectrogram
-        """
-        spectrogram = data.copy()
-        freq = np.random.randint(0, self.mask_factor + 1)
-        freq = min(freq, spectrogram.shape[1])
-        freq0 = np.random.randint(0, spectrogram.shape[1] - freq + 1)
-        spectrogram[:, freq0:freq0 + freq, :] = 0  # masking
-        return spectrogram
-
-
-class FreqMaskingAugmenter(SpectrogramAugmenter):
-    def __init__(self,
-                 mask_factor: float = 27,
-                 name: str = "FreqMaskingAugmenter",
-                 verbose=0):
-        super(FreqMaskingAugmenter, self).__init__(
-            action=Action.SUBSTITUTE, zone=(0.2, 0.8), name=name, device="cpu", verbose=verbose,
-            coverage=1., factor=(40, 80), silence=False, stateless=True)
-        self.model = FreqMaskingModel(mask_factor)
-
-    def substitute(self, data):
-        return self.model.mask(data)
-
-
-class FreqMasking(SpectrogramAugmenter):
-    def __init__(self,
-                 num_masks: int = 1,
-                 mask_factor: float = 27,
-                 name: str = "FreqMasking",
-                 verbose=0):
-        super(FreqMasking, self).__init__(
-            action=Action.SUBSTITUTE, zone=(0.2, 0.8), name=name, device="cpu", verbose=verbose,
-            coverage=1., factor=(40, 80), silence=False, stateless=True)
-        self.flow = Sequential([FreqMaskingAugmenter(mask_factor) for _ in range(num_masks)])
-
-    def substitute(self, data):
-        return self.flow.augment(data)
-
-
-class TFFreqMasking:
-    def __init__(self, num_masks: int = 1, mask_factor: float = 27):
-        self.num_masks = num_masks
-        self.mask_factor = mask_factor
-
-    @tf.function
-    def augment(self, spectrogram: tf.Tensor):
-        """
-        Masking the frequency channels (shape[1])
-        Args:
-            spectrogram: shape (T, num_feature_bins, V)
-        Returns:
-            frequency masked spectrogram
-        """
-        T, F, V = shape_list(spectrogram, out_type=tf.int32)
-        for _ in range(self.num_masks):
-            f = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32)
-            f = tf.minimum(f, F)
-            f0 = tf.random.uniform([], minval=0, maxval=(F - f), dtype=tf.int32)
-            mask = tf.concat([
-                tf.ones([T, f0, V], dtype=spectrogram.dtype),
-                tf.zeros([T, f, V], dtype=spectrogram.dtype),
-                tf.ones([T, F - f0 - f, V], dtype=spectrogram.dtype)
-            ], axis=1)
-            spectrogram = spectrogram * mask
-        return spectrogram
-
-
-# ---------------------------- TIME MASKING ----------------------------
-
-
-class TimeMaskingModel(Spectrogram):
-    def __init__(self, mask_factor: float = 100, p_upperbound: float = 1.0):
-        """
-        Args:
-            time_mask_param: parameter W of time masking
-            p_upperbound: an upperbound so that the number of masked time
-                steps must not exceed p_upperbound * total_time_steps
-        """
-        super(TimeMaskingModel, self).__init__()
-        self.mask_factor = mask_factor
-        self.p_upperbound = p_upperbound
-        assert 0.0 <= self.p_upperbound <= 1.0, "0.0 <= p_upperbound <= 1.0"
-
-    def mask(self, data: np.ndarray) -> np.ndarray:
-        """
-        Masking the time steps (make features on some time steps 0)
-        Args:
-            spectrogram: shape (T, num_feature_bins, V)
-        Returns:
-            a tensor that's applied time masking
-        """
-        spectrogram = data.copy()
-        time = np.random.randint(0, self.mask_factor + 1)
-        time = min(time, int(self.p_upperbound * spectrogram.shape[0]))
-        time0 = np.random.randint(0, spectrogram.shape[0] - time + 1)
-        spectrogram[time0:time0 + time, :, :] = 0
-        return spectrogram
-
-
-class TimeMaskingAugmenter(SpectrogramAugmenter):
-    def __init__(self,
-                 mask_factor: float = 100,
-                 p_upperbound: float = 1,
-                 name: str = "TimeMaskingAugmenter",
-                 verbose=0):
-        super(TimeMaskingAugmenter, self).__init__(
-            action=Action.SUBSTITUTE, zone=(0.2, 0.8), name=name, device="cpu", verbose=verbose,
-            coverage=1., silence=False, stateless=True)
-        self.model = TimeMaskingModel(mask_factor, p_upperbound)
-
-    def substitute(self, data):
-        return self.model.mask(data)
-
-
-class TimeMasking(SpectrogramAugmenter):
-    def __init__(self,
-                 num_masks: int = 1,
-                 mask_factor: float = 100,
-                 p_upperbound: float = 1,
-                 name: str = "TimeMasking",
-                 verbose=0):
-        super(TimeMasking, self).__init__(
-            action=Action.SUBSTITUTE, zone=(0.2, 0.8), name=name, device="cpu", verbose=verbose,
-            coverage=1., silence=False, stateless=True)
-        self.flow = Sequential([
-            TimeMaskingAugmenter(mask_factor, p_upperbound) for _ in range(num_masks)
-        ])
-
-    def substitute(self, data):
-        return self.flow.augment(data)
-
-
-class TFTimeMasking:
-    def __init__(self, num_masks: int = 1, mask_factor: float = 100, p_upperbound: float = 1.0):
-        self.num_masks = num_masks
-        self.mask_factor = mask_factor
-        self.p_upperbound = p_upperbound
-
-    @tf.function
-    def augment(self, spectrogram: tf.Tensor):
-        """
-        Masking the time channel (shape[0])
-        Args:
-            spectrogram: shape (T, num_feature_bins, V)
-        Returns:
-            frequency masked spectrogram
-        """
-        T, F, V = shape_list(spectrogram, out_type=tf.int32)
-        for _ in range(self.num_masks):
-            t = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32)
-            t = tf.minimum(t, tf.cast(tf.cast(T, dtype=tf.float32) * self.p_upperbound, dtype=tf.int32))
-            t0 = tf.random.uniform([], minval=0, maxval=(T - t), dtype=tf.int32)
-            mask = tf.concat([
-                tf.ones([t0, F, V], dtype=spectrogram.dtype),
-                tf.zeros([t, F, V], dtype=spectrogram.dtype),
-                tf.ones([T - t0 - t, F, V], dtype=spectrogram.dtype)
-            ], axis=0)
-            spectrogram = spectrogram * mask
-        return spectrogram
diff --git a/tensorflow_asr/configs/config.py b/tensorflow_asr/configs/config.py
index 7c3dcf6e5d..da79ddd1f0 100644
--- a/tensorflow_asr/configs/config.py
+++ b/tensorflow_asr/configs/config.py
@@ -14,7 +14,7 @@
 
 from . import load_yaml
 from ..augmentations.augments import Augmentation
-from ..utils.utils import preprocess_paths
+from ..utils import file_util
 
 
 class DecoderConfig:
@@ -25,12 +25,12 @@ def __init__(self, config: dict = None):
         self.norm_score = config.pop("norm_score", True)
         self.lm_config = config.pop("lm_config", {})
 
-        self.vocabulary = preprocess_paths(config.pop("vocabulary", None))
+        self.vocabulary = file_util.preprocess_paths(config.pop("vocabulary", None))
         self.target_vocab_size = config.pop("target_vocab_size", 1024)
         self.max_subword_length = config.pop("max_subword_length", 4)
-        self.output_path_prefix = preprocess_paths(config.pop("output_path_prefix", None))
+        self.output_path_prefix = file_util.preprocess_paths(config.pop("output_path_prefix", None))
         self.model_type = config.pop("model_type", None)
-        self.corpus_files = preprocess_paths(config.pop("corpus_files", []))
+        self.corpus_files = file_util.preprocess_paths(config.pop("corpus_files", []))
         self.max_corpus_chars = config.pop("max_corpus_chars", None)
         self.reserved_tokens = config.pop("reserved_tokens", None)
 
@@ -41,8 +41,8 @@ class DatasetConfig:
     def __init__(self, config: dict = None):
         if not config: config = {}
         self.stage = config.pop("stage", None)
-        self.data_paths = preprocess_paths(config.pop("data_paths", None))
-        self.tfrecords_dir = preprocess_paths(config.pop("tfrecords_dir", None))
+        self.data_paths = file_util.preprocess_paths(config.pop("data_paths", None))
+        self.tfrecords_dir = file_util.preprocess_paths(config.pop("tfrecords_dir", None))
         self.tfrecords_shards = config.pop("tfrecords_shards", 16)
         self.shuffle = config.pop("shuffle", False)
         self.cache = config.pop("cache", False)
@@ -59,7 +59,7 @@ def __init__(self, config: dict = None):
         self.batch_size = config.pop("batch_size", 1)
         self.accumulation_steps = config.pop("accumulation_steps", 1)
         self.num_epochs = config.pop("num_epochs", 20)
-        self.outdir = preprocess_paths(config.pop("outdir", None))
+        self.outdir = file_util.preprocess_paths(config.pop("outdir", None))
         self.log_interval_steps = config.pop("log_interval_steps", 500)
         self.save_interval_steps = config.pop("save_interval_steps", 500)
         self.eval_interval_steps = config.pop("eval_interval_steps", 1000)
@@ -81,7 +81,7 @@ class Config:
     """ User config class for training, testing or infering """
 
     def __init__(self, path: str):
-        config = load_yaml(preprocess_paths(path))
+        config = load_yaml(file_util.preprocess_paths(path))
         self.speech_config = config.pop("speech_config", {})
         self.decoder_config = config.pop("decoder_config", {})
         self.model_config = config.pop("model_config", {})
diff --git a/tensorflow_asr/datasets/asr_dataset.py b/tensorflow_asr/datasets/asr_dataset.py
index a8d8045680..f2d08de6ae 100755
--- a/tensorflow_asr/datasets/asr_dataset.py
+++ b/tensorflow_asr/datasets/asr_dataset.py
@@ -18,11 +18,11 @@
 import numpy as np
 import tensorflow as tf
 
-from ..augmentations.augments import Augmentation
+from ..augmentations.augmentation import Augmentation
 from .base_dataset import BaseDataset, BUFFER_SIZE, TFRECORD_SHARDS, AUTOTUNE
 from ..featurizers.speech_featurizers import load_and_convert_to_wav, read_raw_audio, tf_read_raw_audio, SpeechFeaturizer
 from ..featurizers.text_featurizers import TextFeaturizer
-from ..utils.utils import bytestring_feature, get_num_batches, preprocess_paths
+from ..utils import feature_util, file_util, math_util, data_util
 
 
 class ASRDataset(BaseDataset):
@@ -62,7 +62,7 @@ def compute_metadata(self):
 
     def save_metadata(self, metadata_prefix: str = None):
         if metadata_prefix is None: return
-        metadata_path = preprocess_paths(metadata_prefix) + ".metadata.json"
+        metadata_path = file_util.preprocess_paths(metadata_prefix) + ".metadata.json"
         if tf.io.gfile.exists(metadata_path):
             with tf.io.gfile.GFile(metadata_path, "r") as f:
                 content = json.loads(f.read())
@@ -79,7 +79,7 @@ def save_metadata(self, metadata_prefix: str = None):
 
     def load_metadata(self, metadata_prefix: str = None):
         if metadata_prefix is None: return
-        metadata_path = preprocess_paths(metadata_prefix) + ".metadata.json"
+        metadata_path = file_util.preprocess_paths(metadata_prefix) + ".metadata.json"
         if tf.io.gfile.exists(metadata_path):
             print(f"Loading metadata from {metadata_path} ...")
             with tf.io.gfile.GFile(metadata_path, "r") as f:
@@ -124,11 +124,11 @@ def preprocess(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor):
             def fn(_path: bytes, _audio: bytes, _indices: bytes):
                 signal = read_raw_audio(_audio, sample_rate=self.speech_featurizer.sample_rate)
 
-                signal = self.augmentations.before.augment(signal)
+                signal = self.augmentations.signal_augment(signal)
 
-                features = self.speech_featurizer.extract(signal)
+                features = self.speech_featurizer.extract(signal.numpy())
 
-                features = self.augmentations.after.augment(features)
+                features = self.augmentations.feature_augment(features)
 
                 label = tf.strings.to_number(tf.strings.split(_indices), out_type=tf.int32)
                 label_length = tf.cast(tf.shape(label)[0], tf.int32)
@@ -148,11 +148,11 @@ def tf_preprocess(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor):
         with tf.device("/CPU:0"):
             signal = tf_read_raw_audio(audio, self.speech_featurizer.sample_rate)
 
-            signal = self.augmentations.before.augment(signal)
+            signal = self.augmentations.signal_augment(signal)
 
             features = self.speech_featurizer.tf_extract(signal)
 
-            features = self.augmentations.after.augment(features)
+            features = self.augmentations.feature_augment(features)
 
             label = tf.strings.to_number(tf.strings.split(indices), out_type=tf.int32)
             label_length = tf.cast(tf.shape(label)[0], tf.int32)
@@ -168,12 +168,27 @@ def parse(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor):
         Returns:
             path, features, input_lengths, labels, label_lengths, pred_inp
         """
-        if self.use_tf: return self.tf_preprocess(path, audio, indices)
-        return self.preprocess(path, audio, indices)
+        if self.use_tf: data = self.tf_preprocess(path, audio, indices)
+        else: data = self.preprocess(path, audio, indices)
+
+        _, features, input_length, label, label_length, prediction, prediction_length = data
+
+        return (
+            data_util.create_inputs(
+                inputs=features,
+                inputs_length=input_length,
+                predictions=prediction,
+                predictions_length=prediction_length
+            ),
+            data_util.create_labels(
+                labels=label,
+                labels_length=label_length
+            )
+        )
 
     # -------------------------------- CREATION -------------------------------------
 
-    def process(self, dataset: tf.data.Dataset, batch_size: int):
+    def process(self, dataset, batch_size):
         dataset = dataset.map(self.parse, num_parallel_calls=AUTOTUNE)
 
         if self.cache:
@@ -189,21 +204,35 @@ def process(self, dataset: tf.data.Dataset, batch_size: int):
         dataset = dataset.padded_batch(
             batch_size=batch_size,
             padded_shapes=(
-                tf.TensorShape([]),
-                tf.TensorShape(self.speech_featurizer.shape),
-                tf.TensorShape([]),
-                tf.TensorShape(self.text_featurizer.shape),
-                tf.TensorShape([]),
-                tf.TensorShape(self.text_featurizer.prepand_shape),
-                tf.TensorShape([]),
+                data_util.create_inputs(
+                    inputs=tf.TensorShape(self.speech_featurizer.shape),
+                    inputs_length=tf.TensorShape([]),
+                    predictions=tf.TensorShape(self.text_featurizer.prepand_shape),
+                    predictions_length=tf.TensorShape([])
+                ),
+                data_util.create_labels(
+                    labels=tf.TensorShape(self.text_featurizer.shape),
+                    labels_length=tf.TensorShape([])
+                ),
+            ),
+            padding_values=(
+                data_util.create_inputs(
+                    inputs= 0.,
+                    inputs_length=0,
+                    predictions=self.text_featurizer.blank,
+                    predictions_length=0
+                ),
+                data_util.create_labels(
+                    labels=self.text_featurizer.blank,
+                    labels_length=0
+                )
             ),
-            padding_values=(None, 0., 0, self.text_featurizer.blank, 0, self.text_featurizer.blank, 0),
-            drop_remainder=self.drop_remainder
+            drop_remainder = self.drop_remainder
         )
 
         # PREFETCH to improve speed of input length
         dataset = dataset.prefetch(AUTOTUNE)
-        self.total_steps = get_num_batches(self.total_steps, batch_size, drop_remainders=self.drop_remainder)
+        self.total_steps = math_util.get_num_batches(self.total_steps, batch_size, drop_remainders=self.drop_remainder)
         return dataset
 
     def create(self, batch_size: int):
@@ -254,9 +283,9 @@ def parse(record):
             def fn(path, indices):
                 audio = load_and_convert_to_wav(path.decode("utf-8")).numpy()
                 feature = {
-                    "path": bytestring_feature([path]),
-                    "audio": bytestring_feature([audio]),
-                    "indices": bytestring_feature([indices])
+                    "path": feature_util.bytestring_feature([path]),
+                    "audio": feature_util.bytestring_feature([audio]),
+                    "indices": feature_util.bytestring_feature([indices])
                 }
                 example = tf.train.Example(features=tf.train.Features(feature=feature))
                 return example.SerializeToString()
diff --git a/tensorflow_asr/datasets/keras/__init__.py b/tensorflow_asr/datasets/keras/__init__.py
deleted file mode 100644
index 5aee10fa36..0000000000
--- a/tensorflow_asr/datasets/keras/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .asr_dataset import ASRDatasetKeras, ASRTFRecordDatasetKeras, ASRSliceDatasetKeras
-__all__ = ['ASRDatasetKeras', 'ASRTFRecordDatasetKeras', 'ASRSliceDatasetKeras']
diff --git a/tensorflow_asr/datasets/keras/asr_dataset.py b/tensorflow_asr/datasets/keras/asr_dataset.py
deleted file mode 100644
index 448ad4011e..0000000000
--- a/tensorflow_asr/datasets/keras/asr_dataset.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tensorflow as tf
-
-from ..asr_dataset import ASRDataset, ASRTFRecordDataset, ASRSliceDataset, AUTOTUNE, TFRECORD_SHARDS
-from ..base_dataset import BUFFER_SIZE
-from ...featurizers.speech_featurizers import SpeechFeaturizer
-from ...featurizers.text_featurizers import TextFeaturizer
-from ...utils.utils import get_num_batches
-from ...augmentations.augments import Augmentation
-
-
-class ASRDatasetKeras(ASRDataset):
-    """ Keras Dataset for ASR using Generator """
-
-    def parse(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor):
-        """
-        Returns:
-            path, features, input_lengths, labels, label_lengths, pred_inp
-        """
-        if self.use_tf: data = self.tf_preprocess(path, audio, indices)
-        else: data = self.preprocess(path, audio, indices)
-
-        _, features, input_length, label, label_length, prediction, prediction_length = data
-
-        return (
-            {
-                "input": features,
-                "input_length": input_length,
-                "prediction": prediction,
-                "prediction_length": prediction_length
-            },
-            {
-                "label": label,
-                "label_length": label_length
-            }
-        )
-
-    def process(self, dataset, batch_size):
-        dataset = dataset.map(self.parse, num_parallel_calls=AUTOTUNE)
-
-        if self.cache:
-            dataset = dataset.cache()
-
-        if self.shuffle:
-            dataset = dataset.shuffle(self.buffer_size, reshuffle_each_iteration=True)
-
-        if self.indefinite:
-            dataset = dataset.repeat()
-
-        # PADDED BATCH the dataset
-        dataset = dataset.padded_batch(
-            batch_size=batch_size,
-            padded_shapes=(
-                {
-                    "input": tf.TensorShape(self.speech_featurizer.shape),
-                    "input_length": tf.TensorShape([]),
-                    "prediction": tf.TensorShape(self.text_featurizer.prepand_shape),
-                    "prediction_length": tf.TensorShape([])
-                },
-                {
-                    "label": tf.TensorShape(self.text_featurizer.shape),
-                    "label_length": tf.TensorShape([])
-                },
-            ),
-            padding_values=(
-                {
-                    "input": 0.,
-                    "input_length": 0,
-                    "prediction": self.text_featurizer.blank,
-                    "prediction_length": 0
-                },
-                {
-                    "label": self.text_featurizer.blank,
-                    "label_length": 0
-                }
-            ),
-            drop_remainder=self.drop_remainder
-        )
-
-        # PREFETCH to improve speed of input length
-        dataset = dataset.prefetch(AUTOTUNE)
-        self.total_steps = get_num_batches(self.total_steps, batch_size, drop_remainders=self.drop_remainder)
-        return dataset
-
-
-class ASRTFRecordDatasetKeras(ASRDatasetKeras, ASRTFRecordDataset):
-    """ Keras Dataset for ASR using TFRecords """
-
-    def __init__(self,
-                 data_paths: list,
-                 tfrecords_dir: str,
-                 speech_featurizer: SpeechFeaturizer,
-                 text_featurizer: TextFeaturizer,
-                 stage: str,
-                 augmentations: Augmentation = Augmentation(None),
-                 tfrecords_shards: int = TFRECORD_SHARDS,
-                 cache: bool = False,
-                 shuffle: bool = False,
-                 use_tf: bool = False,
-                 indefinite: bool = False,
-                 drop_remainder: bool = True,
-                 buffer_size: int = BUFFER_SIZE,
-                 **kwargs):
-        ASRTFRecordDataset.__init__(
-            self, stage=stage, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-            data_paths=data_paths, tfrecords_dir=tfrecords_dir, augmentations=augmentations, cache=cache, shuffle=shuffle,
-            tfrecords_shards=tfrecords_shards, drop_remainder=drop_remainder, buffer_size=buffer_size, use_tf=use_tf,
-            indefinite=indefinite
-        )
-        ASRDatasetKeras.__init__(
-            self, stage=stage, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-            data_paths=data_paths, augmentations=augmentations, cache=cache, shuffle=shuffle,
-            drop_remainder=drop_remainder, buffer_size=buffer_size, use_tf=use_tf,
-            indefinite=indefinite
-        )
-
-    def parse(self, record: tf.Tensor):
-        feature_description = {
-            "path": tf.io.FixedLenFeature([], tf.string),
-            "audio": tf.io.FixedLenFeature([], tf.string),
-            "indices": tf.io.FixedLenFeature([], tf.string)
-        }
-        example = tf.io.parse_single_example(record, feature_description)
-        return ASRDatasetKeras.parse(self, **example)
-
-    def process(self, dataset: tf.data.Dataset, batch_size: int):
-        return ASRDatasetKeras.process(self, dataset, batch_size)
-
-
-class ASRSliceDatasetKeras(ASRDatasetKeras, ASRSliceDataset):
-    """ Keras Dataset for ASR using Slice """
-
-    def __init__(self,
-                 stage: str,
-                 speech_featurizer: SpeechFeaturizer,
-                 text_featurizer: TextFeaturizer,
-                 data_paths: list,
-                 augmentations: Augmentation = Augmentation(None),
-                 cache: bool = False,
-                 shuffle: bool = False,
-                 use_tf: bool = False,
-                 indefinite: bool = False,
-                 drop_remainder: bool = True,
-                 buffer_size: int = BUFFER_SIZE,
-                 **kwargs):
-        ASRSliceDataset.__init__(
-            self, stage=stage, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-            data_paths=data_paths, augmentations=augmentations, cache=cache, shuffle=shuffle,
-            drop_remainder=drop_remainder, buffer_size=buffer_size, use_tf=use_tf,
-            indefinite=indefinite
-        )
-        ASRDatasetKeras.__init__(
-            self, stage=stage, speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-            data_paths=data_paths, augmentations=augmentations, cache=cache, shuffle=shuffle,
-            drop_remainder=drop_remainder, buffer_size=buffer_size, use_tf=use_tf,
-            indefinite=indefinite
-        )
-
-    def parse(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor):
-        return ASRDatasetKeras.parse(self, path, audio, indices)
-
-    def process(self, dataset: tf.data.Dataset, batch_size: int):
-        return ASRDatasetKeras.process(self, dataset, batch_size)
diff --git a/tensorflow_asr/featurizers/methods/__init__.py b/tensorflow_asr/featurizers/methods/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow_asr/featurizers/gammatone.py b/tensorflow_asr/featurizers/methods/gammatone.py
similarity index 100%
rename from tensorflow_asr/featurizers/gammatone.py
rename to tensorflow_asr/featurizers/methods/gammatone.py
diff --git a/tensorflow_asr/featurizers/speech_featurizers.py b/tensorflow_asr/featurizers/speech_featurizers.py
index c6905f9696..0cb133d6df 100755
--- a/tensorflow_asr/featurizers/speech_featurizers.py
+++ b/tensorflow_asr/featurizers/speech_featurizers.py
@@ -23,10 +23,10 @@
 import tensorflow as tf
 import tensorflow_io as tfio
 
-from ..utils.utils import log10, has_tpu
-from .gammatone import fft_weights
+from ..utils import math_util, env_util
+from .methods import gammatone
 
-tpu = has_tpu()
+tpu = env_util.has_tpu()
 
 
 # def tf_resample(signal, rate_in, rate_out):
@@ -398,16 +398,16 @@ def compute_log_mel_spectrogram(self, signal: np.ndarray) -> np.ndarray:
     def compute_log_gammatone_spectrogram(self, signal: np.ndarray) -> np.ndarray:
         S = self.stft(signal)
 
-        gammatone = fft_weights(self.nfft, self.sample_rate,
-                                self.num_feature_bins, width=1.0,
-                                fmin=0, fmax=int(self.sample_rate / 2),
-                                maxlen=(self.nfft / 2 + 1))
+        gtone = gammatone.fft_weights(self.nfft, self.sample_rate,
+                                      self.num_feature_bins, width=1.0,
+                                      fmin=0, fmax=int(self.sample_rate / 2),
+                                      maxlen=(self.nfft / 2 + 1))
 
-        gammatone = gammatone.numpy().astype(np.float32)
+        gtone = gtone.numpy().astype(np.float32)
 
-        gammatone_spectrogram = np.dot(S.T, gammatone)
+        gtone_spectrogram = np.dot(S.T, gtone)
 
-        return self.power_to_db(gammatone_spectrogram)
+        return self.power_to_db(gtone_spectrogram)
 
 
 class TFSpeechFeaturizer(SpeechFeaturizer):
@@ -438,8 +438,8 @@ def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0):
         else:
             ref_value = np.abs(ref)
 
-        log_spec = 10.0 * log10(tf.maximum(amin, magnitude))
-        log_spec -= 10.0 * log10(tf.maximum(amin, ref_value))
+        log_spec = 10.0 * math_util.log10(tf.maximum(amin, magnitude))
+        log_spec -= 10.0 * math_util.log10(tf.maximum(amin, ref_value))
 
         if top_db is not None:
             if top_db < 0:
@@ -507,11 +507,11 @@ def compute_mfcc(self, signal):
     def compute_log_gammatone_spectrogram(self, signal: np.ndarray) -> np.ndarray:
         S = self.stft(signal)
 
-        gammatone = fft_weights(self.nfft, self.sample_rate,
-                                self.num_feature_bins, width=1.0,
-                                fmin=0, fmax=int(self.sample_rate / 2),
-                                maxlen=(self.nfft / 2 + 1))
+        gtone = gammatone.fft_weights(self.nfft, self.sample_rate,
+                                      self.num_feature_bins, width=1.0,
+                                      fmin=0, fmax=int(self.sample_rate / 2),
+                                      maxlen=(self.nfft / 2 + 1))
 
-        gammatone_spectrogram = tf.tensordot(S, gammatone, 1)
+        gtone_spectrogram = tf.tensordot(S, gtone, 1)
 
-        return self.power_to_db(gammatone_spectrogram)
+        return self.power_to_db(gtone_spectrogram)
diff --git a/tensorflow_asr/featurizers/text_featurizers.py b/tensorflow_asr/featurizers/text_featurizers.py
index a6b5dfbb8a..074a159429 100755
--- a/tensorflow_asr/featurizers/text_featurizers.py
+++ b/tensorflow_asr/featurizers/text_featurizers.py
@@ -23,8 +23,7 @@
 import tensorflow_datasets as tds
 
 from ..configs.config import DecoderConfig
-from ..utils.utils import preprocess_paths
-from . import wordpiece
+from ..utils import file_util
 
 ENGLISH_CHARACTERS = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
                       "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
@@ -251,13 +250,13 @@ def corpus_generator():
     @classmethod
     def load_from_file(cls, decoder_config: dict, filename: str = None):
         dconf = DecoderConfig(decoder_config.copy())
-        filename = dconf.vocabulary if filename is None else preprocess_paths(filename)
+        filename = dconf.vocabulary if filename is None else file_util.preprocess_paths(filename)
         filename_prefix = os.path.splitext(filename)[0]
         subwords = tds.deprecated.text.SubwordTextEncoder.load_from_file(filename_prefix)
         return cls(decoder_config, subwords)
 
     def save_to_file(self, filename: str = None):
-        filename = self.decoder_config.vocabulary if filename is None else preprocess_paths(filename)
+        filename = self.decoder_config.vocabulary if filename is None else file_util.preprocess_paths(filename)
         filename_prefix = os.path.splitext(filename)[0]
         return self.subwords.save_to_file(filename_prefix)
 
@@ -325,114 +324,6 @@ def indices2upoints(self, indices: tf.Tensor) -> tf.Tensor:
             return tf.gather_nd(upoints, tf.where(tf.not_equal(upoints, 0)))
 
 
-class TFSubwordFeaturizer(TextFeaturizer):
-    """
-    Extract text feature based on char-level granularity.
-    By looking up the vocabulary table, each line of transcript will be
-    converted to a sequence of integer indexes.
-    """
-
-    def __init__(self, decoder_config: dict, subwords=None):
-        """
-        decoder_config = {
-            "target_vocab_size": int,
-            "max_subword_length": 4,
-            "max_corpus_chars": None,
-            "reserved_tokens": None,
-            "beam_width": int,
-            "lm_config": {
-                ...
-            }
-        }
-        """
-        super(TFSubwordFeaturizer, self).__init__(decoder_config)
-        self.subwords = self.__load_subwords() if subwords is None else subwords
-        self.blank = 0  # subword treats blank as 0
-        self.num_classes = self.subwords.vocab_size
-
-    def __load_subwords(self):
-        return wordpiece.WordpieceTokenizer(self.decoder_config.vocabulary, token_out_type=tf.int32)
-
-    @classmethod
-    def build_from_corpus(cls, decoder_config: dict, corpus_files: list = None, output_file: str = None):
-        dconf = DecoderConfig(decoder_config.copy())
-        corpus_files = dconf.corpus_files if corpus_files is None or len(corpus_files) == 0 else corpus_files
-        filename = dconf.vocabulary if output_file is None else preprocess_paths(output_file)
-
-        def corpus_generator():
-            for file in corpus_files:
-                with open(file, "r", encoding="utf-8") as f:
-                    lines = f.read().splitlines()
-                    lines = lines[1:]
-                for line in lines:
-                    line = line.split("\t")
-                    yield line[-1]
-
-        wordpiece.build_from_corpus(
-            corpus_generator(),
-            output_file_path=filename,
-            target_vocab_size=dconf.target_vocab_size,
-            max_subword_length=dconf.max_subword_length,
-            max_corpus_chars=dconf.max_corpus_chars,
-            reserved_tokens=dconf.reserved_tokens
-        )
-
-        subwords = wordpiece.WordpieceTokenizer(filename, token_out_type=tf.int32)
-        return cls(decoder_config, subwords)
-
-    @classmethod
-    def load_from_file(cls, decoder_config: dict, filename: str = None):
-        dconf = DecoderConfig(decoder_config.copy())
-        filename = dconf.vocabulary if filename is None else preprocess_paths(filename)
-        subwords = wordpiece.WordpieceTokenizer(filename, token_out_type=tf.int32)
-        return cls(decoder_config, subwords)
-
-    def extract(self, text: tf.Tensor) -> tf.Tensor:
-        """
-        Convert string to a list of integers
-        Args:
-            text: string (sequence of characters)
-
-        Returns:
-            sequence of ints in tf.Tensor
-        """
-        indices = self.subwords.tokenize(text)
-        indices = indices.merge_dims(0, -1)
-        return indices.to_tensor()
-
-    def iextract(self, indices: tf.Tensor) -> tf.Tensor:
-        """
-        Convert list of indices to string
-        Args:
-            indices: tf.Tensor with dim [B, None]
-
-        Returns:
-            transcripts: tf.Tensor of dtype tf.string with dim [B]
-        """
-        with tf.device("/CPU:0"):  # string data is not supported on GPU
-            indices = self.normalize_indices(indices)
-            text = self.subwords.detokenize(indices)
-            return tf.strings.reduce_join(text, separator=" ", axis=-1)
-
-    @tf.function(
-        input_signature=[
-            tf.TensorSpec([None], dtype=tf.int32)
-        ]
-    )
-    def indices2upoints(self, indices: tf.Tensor) -> tf.Tensor:
-        """
-        Transform Predicted Indices to Unicode Code Points (for using tflite)
-        Args:
-            indices: tf.Tensor of Classes in shape [None]
-
-        Returns:
-            unicode code points transcript with dtype tf.int32 and shape [None]
-        """
-        with tf.name_scope("indices2upoints"):
-            text = self.iextract(tf.expand_dims(indices, axis=0))
-            return tf.reshape(text, shape=[-1])
-
-
 class SentencePieceFeaturizer(TextFeaturizer):
     """
     Extract text feature based on sentence piece package.
@@ -512,7 +403,7 @@ def corpus_iterator():
     @classmethod
     def load_from_file(cls, decoder_config: dict, filename: str = None):
         if filename is not None:
-            filename_prefix = os.path.splitext(preprocess_paths(filename))[0]
+            filename_prefix = os.path.splitext(file_util.preprocess_paths(filename))[0]
         else:
             filename_prefix = decoder_config.get("output_path_prefix", None)
         processor = sp.SentencePieceProcessor()
diff --git a/tensorflow_asr/featurizers/wordpiece.py b/tensorflow_asr/featurizers/wordpiece.py
deleted file mode 100644
index 56aa599953..0000000000
--- a/tensorflow_asr/featurizers/wordpiece.py
+++ /dev/null
@@ -1,577 +0,0 @@
-# coding=utf-8
-# Copyright 2021 TF.Text Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Algorithm for learning wordpiece vocabulary."""
-
-import re
-import collections
-from typing import List, Optional
-
-import tensorflow as tf
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import sort_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops.ragged import ragged_string_ops
-from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
-
-import numpy as np
-import tensorflow_text as tft
-
-Params = collections.namedtuple("Params", [
-    "upper_thresh", "lower_thresh", "num_iterations", "max_input_tokens",
-    "max_token_length", "max_unique_chars", "vocab_size", "slack_ratio",
-    "include_joiner_token", "joiner", "reserved_tokens"
-])
-
-
-def extract_char_tokens(word_counts):
-    """Extracts all single-character tokens from word_counts.
-    Args:
-      word_counts: list of (string, int) tuples
-    Returns:
-      set of single-character strings contained within word_counts
-    """
-
-    seen_chars = set()
-    for word, _ in word_counts:
-        for char in word:
-            seen_chars.add(char)
-    return seen_chars
-
-
-def ensure_all_tokens_exist(input_tokens, output_tokens, include_joiner_token,
-                            joiner):
-    """Adds all tokens in input_tokens to output_tokens if not already present.
-    Args:
-      input_tokens: set of strings (tokens) we want to include
-      output_tokens: string to int dictionary mapping token to count
-      include_joiner_token: bool whether to include joiner token
-      joiner: string used to indicate suffixes
-    Returns:
-      string to int dictionary with all tokens in input_tokens included
-    """
-
-    for token in input_tokens:
-        if token not in output_tokens:
-            output_tokens[token] = 1
-
-        if include_joiner_token:
-            joined_token = joiner + token
-            if joined_token not in output_tokens:
-                output_tokens[joined_token] = 1
-
-    return output_tokens
-
-
-def get_split_indices(word, curr_tokens, include_joiner_token, joiner):
-    """Gets indices for valid substrings of word, for iterations > 0.
-    For iterations > 0, rather than considering every possible substring, we only
-    want to consider starting points corresponding to the start of wordpieces in
-    the current vocabulary.
-    Args:
-      word: string we want to split into substrings
-      curr_tokens: string to int dict of tokens in vocab (from previous iteration)
-      include_joiner_token: bool whether to include joiner token
-      joiner: string used to indicate suffixes
-    Returns:
-      list of ints containing valid starting indices for word
-    """
-
-    indices = []
-    start = 0
-    while start < len(word):
-        end = len(word)
-        while end > start:
-            subtoken = word[start:end]
-            # Subtoken includes the joiner token.
-            if include_joiner_token and start > 0:
-                subtoken = joiner + subtoken
-            # If subtoken is part of vocab, "end" is a valid start index.
-            if subtoken in curr_tokens:
-                indices.append(end)
-                break
-            end -= 1
-
-        if end == start:
-            return None
-        start = end
-
-    return indices
-
-
-def get_search_threshs(word_counts, upper_thresh, lower_thresh):
-    """Clips the thresholds for binary search based on current word counts.
-    The upper threshold parameter typically has a large default value that can
-    result in many iterations of unnecessary search. Thus we clip the upper and
-    lower bounds of search to the maximum and the minimum wordcount values.
-    Args:
-      word_counts: list of (string, int) tuples
-      upper_thresh: int, upper threshold for binary search
-      lower_thresh: int, lower threshold for binary search
-    Returns:
-      upper_search: int, clipped upper threshold for binary search
-      lower_search: int, clipped lower threshold for binary search
-    """
-
-    counts = [count for _, count in word_counts]
-    max_count = max(counts)
-    min_count = min(counts)
-
-    if upper_thresh is None:
-        upper_search = max_count
-    else:
-        upper_search = max_count if max_count < upper_thresh else upper_thresh
-
-    if lower_thresh is None:
-        lower_search = min_count
-    else:
-        lower_search = min_count if min_count > lower_thresh else lower_thresh
-
-    return upper_search, lower_search
-
-
-def get_input_words(word_counts, reserved_tokens, max_token_length):
-    """Filters out words that are longer than max_token_length or are reserved.
-    Args:
-      word_counts: list of (string, int) tuples
-      reserved_tokens: list of strings
-      max_token_length: int, maximum length of a token
-    Returns:
-      list of (string, int) tuples of filtered wordcounts
-    """
-
-    all_counts = []
-
-    for word, count in word_counts:
-        if len(word) > max_token_length or word in reserved_tokens:
-            continue
-        all_counts.append((word, count))
-
-    return all_counts
-
-
-def get_allowed_chars(all_counts, max_unique_chars):
-    """Get the top max_unique_chars characters within our wordcounts.
-    We want each character to be in the vocabulary so that we can keep splitting
-    down to the character level if necessary. However, in order not to inflate
-    our vocabulary with rare characters, we only keep the top max_unique_chars
-    characters.
-    Args:
-      all_counts: list of (string, int) tuples
-      max_unique_chars: int, maximum number of unique single-character tokens
-    Returns:
-      set of strings containing top max_unique_chars characters in all_counts
-    """
-
-    char_counts = collections.defaultdict(int)
-
-    for word, count in all_counts:
-        for char in word:
-            char_counts[char] += count
-
-    # Sort by count, then alphabetically.
-    sorted_counts = sorted(sorted(char_counts.items(), key=lambda x: x[0]),
-                           key=lambda x: x[1], reverse=True)
-
-    allowed_chars = set()
-    for i in range(min(len(sorted_counts), max_unique_chars)):
-        allowed_chars.add(sorted_counts[i][0])
-    return allowed_chars
-
-
-def filter_input_words(all_counts, allowed_chars, max_input_tokens):
-    """Filters out words with unallowed chars and limits words to max_input_tokens.
-    Args:
-      all_counts: list of (string, int) tuples
-      allowed_chars: list of single-character strings
-      max_input_tokens: int, maximum number of tokens accepted as input
-    Returns:
-      list of (string, int) tuples of filtered wordcounts
-    """
-    # Ensure that the input is sorted so that if `max_input_tokens` is reached
-    # the least common tokens are dropped.
-    all_counts = sorted(
-        all_counts, key=lambda word_and_count: word_and_count[1], reverse=True)
-    filtered_counts = []
-    for word, count in all_counts:
-        if (max_input_tokens != -1 and
-                len(filtered_counts) >= max_input_tokens):
-            break
-        has_unallowed_chars = False
-        for char in word:
-            if char not in allowed_chars:
-                has_unallowed_chars = True
-                break
-        if has_unallowed_chars:
-            continue
-        filtered_counts.append((word, count))
-
-    return filtered_counts
-
-
-def generate_final_vocabulary(reserved_tokens, char_tokens, curr_tokens):
-    """Generates final vocab given reserved, single-character, and current tokens.
-    Args:
-      reserved_tokens: list of strings (tokens) that must be included in vocab
-      char_tokens: set of single-character strings
-      curr_tokens: string to int dict mapping token to count
-    Returns:
-      list of strings representing final vocabulary
-    """
-
-    sorted_char_tokens = sorted(list(char_tokens))
-    vocab_char_arrays = []
-    vocab_char_arrays.extend(reserved_tokens)
-    vocab_char_arrays.extend(sorted_char_tokens)
-
-    # Sort by count, then alphabetically.
-    sorted_tokens = sorted(sorted(curr_tokens.items(), key=lambda x: x[0]),
-                           key=lambda x: x[1], reverse=True)
-    for token, _ in sorted_tokens:
-        vocab_char_arrays.append(token)
-
-    seen_tokens = set()
-    # Adding unique tokens to list to maintain sorted order.
-    vocab_words = []
-    for word in vocab_char_arrays:
-        if word in seen_tokens:
-            continue
-        seen_tokens.add(word)
-        vocab_words.append(word)
-
-    return vocab_words
-
-
-def learn_with_thresh(word_counts, thresh, params):
-    """Wordpiece learning algorithm to produce a vocab given frequency threshold.
-    Args:
-      word_counts: list of (string, int) tuples
-      thresh: int, frequency threshold for a token to be included in the vocab
-      params: Params namedtuple, parameters for learning
-    Returns:
-      list of strings, vocabulary generated for the given thresh
-    """
-
-    # Set of single-character tokens.
-    char_tokens = extract_char_tokens(word_counts)
-    curr_tokens = ensure_all_tokens_exist(char_tokens, {},
-                                          params.include_joiner_token,
-                                          params.joiner)
-
-    for iteration in range(params.num_iterations):
-        subtokens = [dict() for _ in range(params.max_token_length + 1)]
-        # Populate array with counts of each subtoken.
-        for word, count in word_counts:
-            if iteration == 0:
-                split_indices = range(1, len(word) + 1)
-            else:
-                split_indices = get_split_indices(word, curr_tokens,
-                                                  params.include_joiner_token,
-                                                  params.joiner)
-                if not split_indices:
-                    continue
-
-            start = 0
-            for index in split_indices:
-                for end in range(start + 1, len(word) + 1):
-                    subtoken = word[start:end]
-                    length = len(subtoken)
-                    if params.include_joiner_token and start > 0:
-                        subtoken = params.joiner + subtoken
-                    if subtoken in subtokens[length]:
-                        # Subtoken exists, increment count.
-                        subtokens[length][subtoken] += count
-                    else:
-                        # New subtoken, add to dict.
-                        subtokens[length][subtoken] = count
-                start = index
-
-        next_tokens = {}
-        # Get all tokens that have a count above the threshold.
-        for length in range(params.max_token_length, 0, -1):
-            for token, count in subtokens[length].items():
-                if count >= thresh:
-                    next_tokens[token] = count
-                # Decrement the count of all prefixes.
-                if len(token) > length:  # This token includes the joiner.
-                    joiner_len = len(params.joiner)
-                    for i in range(1 + joiner_len, length + joiner_len):
-                        prefix = token[0:i]
-                        if prefix in subtokens[i - joiner_len]:
-                            subtokens[i - joiner_len][prefix] -= count
-                else:
-                    for i in range(1, length):
-                        prefix = token[0:i]
-                        if prefix in subtokens[i]:
-                            subtokens[i][prefix] -= count
-
-        # Add back single-character tokens.
-        curr_tokens = ensure_all_tokens_exist(char_tokens, next_tokens,
-                                              params.include_joiner_token,
-                                              params.joiner)
-
-    vocab_words = generate_final_vocabulary(params.reserved_tokens, char_tokens,
-                                            curr_tokens)
-
-    return vocab_words
-
-
-def learn_binary_search(word_counts, lower, upper, params):
-    """Performs binary search to find wordcount frequency threshold.
-    Given upper and lower bounds and a list of (word, count) tuples, performs
-    binary search to find the threshold closest to producing a vocabulary
-    of size vocab_size.
-    Args:
-      word_counts: list of (string, int) tuples
-      lower: int, lower bound for binary search
-      upper: int, upper bound for binary search
-      params: Params namedtuple, parameters for learning
-    Returns:
-      list of strings, vocab that is closest to target vocab_size
-    """
-    thresh = (upper + lower) // 2
-    current_vocab = learn_with_thresh(word_counts, thresh, params)
-    current_vocab_size = len(current_vocab)
-
-    # Allow count to be within k% of the target count, where k is slack ratio.
-    slack_count = params.slack_ratio * params.vocab_size
-    if slack_count < 0:
-        slack_count = 0
-
-    is_within_slack = (current_vocab_size <= params.vocab_size) and (
-        params.vocab_size - current_vocab_size <= slack_count)
-
-    # We"ve created a vocab within our goal range (or, ran out of search space).
-    if is_within_slack or lower >= upper or thresh <= 1:
-        return current_vocab
-
-    current_vocab = None
-
-    if current_vocab_size > params.vocab_size:
-        return learn_binary_search(word_counts, thresh + 1, upper, params)
-
-    else:
-        return learn_binary_search(word_counts, lower, thresh - 1, params)
-
-
-def count_words(iterable) -> collections.Counter:
-    """Converts a iterable of arrays of words into a `Counter` of word counts."""
-    counts = collections.Counter()
-    for words in iterable:
-        # Convert a RaggedTensor to a flat/dense Tensor.
-        words = getattr(words, "flat_values", words)
-        # Flatten any dense tensor
-        words = np.reshape(words, [-1])
-        counts.update(words)
-
-    # Decode the words if necessary.
-    example_word = next(iter(counts.keys()))
-    if isinstance(example_word, bytes):
-        counts = collections.Counter(
-            {word.decode("utf-8"): count for word, count in counts.items()})
-
-    return counts
-
-
-def learn(word_counts,
-          vocab_size: int,
-          reserved_tokens: List[str] = [],
-          upper_thresh: Optional[int] = int(1e7),
-          lower_thresh: Optional[int] = 10,
-          num_iterations: int = 4,
-          max_input_tokens: Optional[int] = int(5e6),
-          max_token_length: int = 50,
-          max_unique_chars: int = 1000,
-          slack_ratio: float = 0.05,
-          include_joiner_token: bool = True,
-          joiner: str = "##") -> List[str]:
-    """Takes in wordcounts and returns wordpiece vocabulary.
-    Args:
-      word_counts: (word, count) pairs as a dictionary, or list of tuples.
-      vocab_size: The target vocabulary size. This is the maximum size.
-      reserved_tokens: A list of tokens that must be included in the vocabulary.
-      upper_thresh: Initial upper bound on the token frequency threshold.
-      lower_thresh: Initial lower bound on the token frequency threchold.
-      num_iterations: Number of iterations to run.
-      max_input_tokens: The maximum number of words in the initial vocabulary. The
-        words with the lowest counts are discarded. Use `None` or `-1` for "no
-        maximum".
-      max_token_length: The maximum token length. Counts for longer words are
-        discarded.
-      max_unique_chars: The maximum alphabet size. This prevents rare characters
-        from inflating the vocabulary. Counts for words containing characters
-        ouside of the selected alphabet are discarded.
-      slack_ratio: The maximum deviation acceptable from `vocab_size` for an
-        acceptable vocabulary. The acceptable range of vocabulary sizes is from
-        `vocab_size*(1-slack_ratio)` to `vocab_size`.
-      include_joiner_token: If true, include the `joiner` token in the output
-        vocabulary.
-      joiner: The prefix to include on suffix tokens in the output vocabulary.
-        Usually "##". For example "places" could be tokenized as `["place",
-        "##s"]`.
-    Returns:
-      string, final vocabulary with each word separated by newline
-    """
-    if isinstance(word_counts, dict):
-        word_counts = word_counts.items()
-
-    params = Params(upper_thresh, lower_thresh, num_iterations, max_input_tokens,
-                    max_token_length, max_unique_chars, vocab_size, slack_ratio,
-                    include_joiner_token, joiner, reserved_tokens)
-
-    upper_search, lower_search = get_search_threshs(word_counts,
-                                                    params.upper_thresh,
-                                                    params.lower_thresh)
-    all_counts = get_input_words(word_counts, params.reserved_tokens,
-                                 params.max_token_length)
-    allowed_chars = get_allowed_chars(all_counts, params.max_unique_chars)
-
-    filtered_counts = filter_input_words(all_counts, allowed_chars,
-                                         params.max_input_tokens)
-
-    vocab = learn_binary_search(filtered_counts, lower_search, upper_search,
-                                params)
-
-    return vocab
-
-
-def build_word_counts(corpus_generator):
-    counts = {}
-    for transcript in corpus_generator:
-        words = transcript.split()
-        for word in words:
-            if counts.get(word, None) is None:
-                counts[word] = 0
-            else:
-                counts[word] += 1
-    return counts
-
-
-def build_from_corpus(corpus_generator,
-                      target_vocab_size: int,
-                      output_file_path: str,
-                      max_subword_length: int = 50,
-                      max_corpus_chars: int = None,
-                      reserved_tokens: List[str] = [],
-                      num_iterations: int = 4):
-    word_counts = build_word_counts(corpus_generator)
-    max_corpus_chars = max_corpus_chars or 1e7
-    reserved_tokens = reserved_tokens or []
-    vocab = learn(word_counts, target_vocab_size,
-                  reserved_tokens=reserved_tokens, num_iterations=num_iterations,
-                  max_input_tokens=10000000, max_token_length=max_subword_length, max_unique_chars=max_corpus_chars)
-    with open(output_file_path, "w") as f:
-        for token in vocab: print(token, file=f)
-
-
-class WordpieceTokenizer(tft.WordpieceTokenizer):
-    @property
-    def vocab_size(self):
-        vocab, _ = self._get_vocab_and_ids()
-        return tf.shape(vocab)[0].numpy()
-
-    def _get_vocab_and_ids(self):
-        export = getattr(self._vocab_lookup_table, 'export', None)
-        if export is None:
-            table = getattr(self._vocab_lookup_table, '_table')
-            export = table.export
-
-        vocab, ids = export()  # pylint: disable=protected-access
-
-        # `.export` doesn't set the shapes.
-        vocab = check_ops.ensure_shape(vocab, [
-            None,
-        ])
-        ids = check_ops.ensure_shape(ids, [
-            None,
-        ])
-
-        order = sort_ops.argsort(ids)
-
-        ids = array_ops.gather(ids, order)
-        vocab = array_ops.gather(vocab, order)
-
-        return vocab, ids
-
-    def detokenize(self, token_ids):
-        r"""Convert a `Tensor` or `RaggedTensor` of wordpiece IDs to string-words.
-        >>> import pathlib
-        >>> pathlib.Path('vocab.txt').write_text(
-        ...     "a b c ##a ##b ##c".replace(' ', '\n'))
-        >>> wordpiece = text.WordpieceTokenizer('vocab.txt')
-        >>> token_ids = [[0, 4, 5, 2, 5, 5, 5]]
-        >>> wordpiece.detokenize(token_ids)
-        <tf.RaggedTensor [[b'ab', b'cccc']]>
-        The word pieces are joined along the innermost axis to make words. So the
-        result has the same rank as the input, but the innermost axis of the result
-        indexes words instead of word pieces.
-        The shape transformation is: `[..., wordpieces] => [..., words]`
-        When the input shape is `[..., words, wordpieces]` (like the output of
-        `WordpieceTokenizer.tokenize`) the result's shape is `[..., words, 1]`.
-        The additional ragged axis can be removed using `words.merge_dims(-2, -1)`.
-        Note: This method assumes wordpiece IDs are dense on the interval
-        `[0, vocab_size)`.
-        Args:
-          token_ids: A `RaggedTensor` or `Tensor` with an int dtype. Must have
-          `ndims >= 2`
-        Returns:
-          A `RaggedTensor` with dtype `string` and the rank as the input
-          `token_ids`.
-        """
-        # If there are performance issues with this method or problems with lookup
-        # tables using sparse IDs see the notes in b/177610044.
-        vocab, ids = self._get_vocab_and_ids()
-
-        first_is_zero = tf.math.equal(ids[0], 0)
-        steps = ids[1:] - ids[:-1]
-        all_one_step = tf.reduce_all(tf.math.equal(steps, 1))
-
-        check = control_flow_ops.Assert(
-            first_is_zero & all_one_step,
-            data=[('`detokenize` only works with vocabulary tables where the '
-                   'indices are dense on the interval `[0, vocab_size)`')])
-        with ops.control_dependencies([check]):
-            token_ids = tf.math.minimum(
-                token_ids,
-                # Limit the OOV buckets to a single index.
-                tf.cast(array_ops.size(vocab), token_ids.dtype))
-
-        # Add the unknown token at that index.
-        vocab = array_ops.concat([vocab, [self._unknown_token]], axis=0)
-
-        # Lookup the text tokens and join them along the innermost axis.
-        txt_tokens = array_ops.gather(vocab, token_ids)
-
-        # Ensure the input is Ragged.
-        if not isinstance(txt_tokens, RaggedTensor):
-            txt_tokens = RaggedTensor.from_tensor(txt_tokens)
-
-        # Join the tokens along the last axis.
-        words = string_ops.reduce_join_v2(txt_tokens, axis=-1, separator=' ')
-
-        # Collapse " ##" in all strings to make words.
-        words = string_ops.regex_replace(
-            words, ' ' + re.escape(self._suffix_indicator), '')
-
-        # Strip leading and trailing spaces.
-        words = string_ops.regex_replace(words, '^ +| +$', '')
-
-        # Split on spaces so the last axis is "words".
-        words = ragged_string_ops.string_split_v2(words, sep=' ')
-        return words
diff --git a/tensorflow_asr/models/keras/conformer.py b/tensorflow_asr/models/keras/conformer.py
deleted file mode 100644
index d8aa36f7d1..0000000000
--- a/tensorflow_asr/models/keras/conformer.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .transducer import Transducer
-from ..conformer import ConformerEncoder, L2
-
-
-class Conformer(Transducer):
-    def __init__(self,
-                 vocabulary_size: int,
-                 encoder_subsampling: dict,
-                 encoder_positional_encoding: str = "sinusoid",
-                 encoder_dmodel: int = 144,
-                 encoder_num_blocks: int = 16,
-                 encoder_head_size: int = 36,
-                 encoder_num_heads: int = 4,
-                 encoder_mha_type: str = "relmha",
-                 encoder_kernel_size: int = 32,
-                 encoder_depth_multiplier: int = 1,
-                 encoder_fc_factor: float = 0.5,
-                 encoder_dropout: float = 0,
-                 encoder_trainable: bool = True,
-                 prediction_embed_dim: int = 512,
-                 prediction_embed_dropout: int = 0,
-                 prediction_num_rnns: int = 1,
-                 prediction_rnn_units: int = 320,
-                 prediction_rnn_type: str = "lstm",
-                 prediction_rnn_implementation: int = 2,
-                 prediction_layer_norm: bool = True,
-                 prediction_projection_units: int = 0,
-                 prediction_trainable: bool = True,
-                 joint_dim: int = 1024,
-                 joint_activation: str = "tanh",
-                 prejoint_linear: bool = True,
-                 postjoint_linear: bool = False,
-                 joint_mode: str = "add",
-                 joint_trainable: bool = True,
-                 kernel_regularizer=L2,
-                 bias_regularizer=L2,
-                 name: str = "conformer",
-                 **kwargs):
-        super(Conformer, self).__init__(
-            encoder=ConformerEncoder(
-                subsampling=encoder_subsampling,
-                positional_encoding=encoder_positional_encoding,
-                dmodel=encoder_dmodel,
-                num_blocks=encoder_num_blocks,
-                head_size=encoder_head_size,
-                num_heads=encoder_num_heads,
-                mha_type=encoder_mha_type,
-                kernel_size=encoder_kernel_size,
-                depth_multiplier=encoder_depth_multiplier,
-                fc_factor=encoder_fc_factor,
-                dropout=encoder_dropout,
-                kernel_regularizer=kernel_regularizer,
-                bias_regularizer=bias_regularizer,
-                trainable=encoder_trainable,
-                name=f"{name}_encoder"
-            ),
-            vocabulary_size=vocabulary_size,
-            embed_dim=prediction_embed_dim,
-            embed_dropout=prediction_embed_dropout,
-            num_rnns=prediction_num_rnns,
-            rnn_units=prediction_rnn_units,
-            rnn_type=prediction_rnn_type,
-            rnn_implementation=prediction_rnn_implementation,
-            layer_norm=prediction_layer_norm,
-            projection_units=prediction_projection_units,
-            prediction_trainable=prediction_trainable,
-            joint_dim=joint_dim,
-            joint_activation=joint_activation,
-            prejoint_linear=prejoint_linear,
-            postjoint_linear=postjoint_linear,
-            joint_mode=joint_mode,
-            joint_trainable=joint_trainable,
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer,
-            name=name,
-            **kwargs
-        )
-        self.dmodel = encoder_dmodel
-        self.time_reduction_factor = self.encoder.conv_subsampling.time_reduction_factor
diff --git a/tensorflow_asr/models/keras/contextnet.py b/tensorflow_asr/models/keras/contextnet.py
deleted file mode 100644
index 7e43cbbbfb..0000000000
--- a/tensorflow_asr/models/keras/contextnet.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List
-import tensorflow as tf
-
-from .transducer import Transducer
-from ..contextnet import ContextNetEncoder, L2
-from ...utils.utils import get_reduced_length
-
-
-class ContextNet(Transducer):
-    def __init__(self,
-                 vocabulary_size: int,
-                 encoder_blocks: List[dict],
-                 encoder_alpha: float = 0.5,
-                 encoder_trainable: bool = True,
-                 prediction_embed_dim: int = 512,
-                 prediction_embed_dropout: int = 0,
-                 prediction_num_rnns: int = 1,
-                 prediction_rnn_units: int = 320,
-                 prediction_rnn_type: str = "lstm",
-                 prediction_rnn_implementation: int = 2,
-                 prediction_layer_norm: bool = True,
-                 prediction_projection_units: int = 0,
-                 prediction_trainable: bool = True,
-                 joint_dim: int = 1024,
-                 joint_activation: str = "tanh",
-                 prejoint_linear: bool = True,
-                 postjoint_linear: bool = False,
-                 joint_mode: str = "add",
-                 joint_trainable: bool = True,
-                 kernel_regularizer=L2,
-                 bias_regularizer=L2,
-                 name: str = "contextnet",
-                 **kwargs):
-        super(ContextNet, self).__init__(
-            encoder=ContextNetEncoder(
-                blocks=encoder_blocks,
-                alpha=encoder_alpha,
-                kernel_regularizer=kernel_regularizer,
-                bias_regularizer=bias_regularizer,
-                trainable=encoder_trainable,
-                name=f"{name}_encoder"
-            ),
-            vocabulary_size=vocabulary_size,
-            embed_dim=prediction_embed_dim,
-            embed_dropout=prediction_embed_dropout,
-            num_rnns=prediction_num_rnns,
-            rnn_units=prediction_rnn_units,
-            rnn_type=prediction_rnn_type,
-            rnn_implementation=prediction_rnn_implementation,
-            layer_norm=prediction_layer_norm,
-            projection_units=prediction_projection_units,
-            prediction_trainable=prediction_trainable,
-            joint_dim=joint_dim,
-            joint_activation=joint_activation,
-            prejoint_linear=prejoint_linear,
-            postjoint_linear=postjoint_linear,
-            joint_mode=joint_mode,
-            joint_trainable=joint_trainable,
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer,
-            name=name,
-            **kwargs
-        )
-        self.dmodel = self.encoder.blocks[-1].dmodel
-        self.time_reduction_factor = 1
-        for block in self.encoder.blocks: self.time_reduction_factor *= block.time_reduction_factor
-
-    def call(self, inputs, training=False, **kwargs):
-        enc = self.encoder([inputs["input"], inputs["input_length"]], training=training, **kwargs)
-        pred = self.predict_net([inputs["prediction"], inputs["prediction_length"]], training=training, **kwargs)
-        outputs = self.joint_net([enc, pred], training=training, **kwargs)
-        return {
-            "logit": outputs,
-            "logit_length": get_reduced_length(inputs["input_length"], self.time_reduction_factor)
-        }
-
-    def encoder_inference(self, features: tf.Tensor, input_length: tf.Tensor):
-        with tf.name_scope(f"{self.name}_encoder"):
-            input_length = tf.expand_dims(tf.shape(features)[0], axis=0)
-            outputs = tf.expand_dims(features, axis=0)
-            outputs = self.encoder([outputs, input_length], training=False)
-            return tf.squeeze(outputs, axis=0)
-
-    # -------------------------------- GREEDY -------------------------------------
-
-    @tf.function
-    def recognize(self,
-                  features: tf.Tensor,
-                  input_length: tf.Tensor,
-                  parallel_iterations: int = 10,
-                  swap_memory: bool = True):
-        """
-        RNN Transducer Greedy decoding
-        Args:
-            features (tf.Tensor): a batch of padded extracted features
-
-        Returns:
-            tf.Tensor: a batch of decoded transcripts
-        """
-        encoded = self.encoder([features, input_length], training=False)
-        return self._perform_greedy_batch(encoded, input_length,
-                                          parallel_iterations=parallel_iterations, swap_memory=swap_memory)
-
-    def recognize_tflite(self, signal, predicted, prediction_states):
-        """
-        Function to convert to tflite using greedy decoding (default streaming mode)
-        Args:
-            signal: tf.Tensor with shape [None] indicating a single audio signal
-            predicted: last predicted character with shape []
-            prediction_states: lastest prediction states with shape [num_rnns, 1 or 2, 1, P]
-
-        Return:
-            transcript: tf.Tensor of Unicode Code Points with shape [None] and dtype tf.int32
-            predicted: last predicted character with shape []
-            encoder_states: lastest encoder states with shape [num_rnns, 1 or 2, 1, P]
-            prediction_states: lastest prediction states with shape [num_rnns, 1 or 2, 1, P]
-        """
-        features = self.speech_featurizer.tf_extract(signal)
-        encoded = self.encoder_inference(features, tf.shape(features)[0])
-        hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, prediction_states)
-        transcript = self.text_featurizer.indices2upoints(hypothesis.prediction)
-        return transcript, hypothesis.index, hypothesis.states
-
-    def recognize_tflite_with_timestamp(self, signal, predicted, states):
-        features = self.speech_featurizer.tf_extract(signal)
-        encoded = self.encoder_inference(features, tf.shape(features)[0])
-        hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, states)
-        indices = self.text_featurizer.normalize_indices(hypothesis.prediction)
-        upoints = tf.gather_nd(self.text_featurizer.upoints, tf.expand_dims(indices, axis=-1))  # [None, max_subword_length]
-
-        num_samples = tf.cast(tf.shape(signal)[0], dtype=tf.float32)
-        total_time_reduction_factor = self.time_reduction_factor * self.speech_featurizer.frame_step
-
-        stime = tf.range(0, num_samples, delta=total_time_reduction_factor, dtype=tf.float32)
-        stime /= tf.cast(self.speech_featurizer.sample_rate, dtype=tf.float32)
-
-        etime = tf.range(total_time_reduction_factor, num_samples, delta=total_time_reduction_factor, dtype=tf.float32)
-        etime /= tf.cast(self.speech_featurizer.sample_rate, dtype=tf.float32)
-
-        non_blank = tf.where(tf.not_equal(upoints, 0))
-        non_blank_transcript = tf.gather_nd(upoints, non_blank)
-        non_blank_stime = tf.gather_nd(tf.repeat(tf.expand_dims(stime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank)
-        non_blank_etime = tf.gather_nd(tf.repeat(tf.expand_dims(etime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank)
-
-        return non_blank_transcript, non_blank_stime, non_blank_etime, hypothesis.index, hypothesis.states
-
-    # -------------------------------- BEAM SEARCH -------------------------------------
-
-    @tf.function
-    def recognize_beam(self,
-                       features: tf.Tensor,
-                       input_length: tf.Tensor,
-                       lm: bool = False,
-                       parallel_iterations: int = 10,
-                       swap_memory: bool = True):
-        """
-        RNN Transducer Beam Search
-        Args:
-            features (tf.Tensor): a batch of padded extracted features
-            lm (bool, optional): whether to use language model. Defaults to False.
-
-        Returns:
-            tf.Tensor: a batch of decoded transcripts
-        """
-        encoded = self.encoder([features, input_length], training=False)
-        return self._perform_beam_search_batch(encoded, input_length, lm,
-                                               parallel_iterations=parallel_iterations, swap_memory=swap_memory)
diff --git a/tensorflow_asr/models/keras/ctc.py b/tensorflow_asr/models/keras/ctc.py
deleted file mode 100644
index a21eff502b..0000000000
--- a/tensorflow_asr/models/keras/ctc.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tensorflow as tf
-from tensorflow.keras import mixed_precision as mxp
-
-from ..ctc import CtcModel as BaseCtcModel
-from ...utils.utils import get_reduced_length
-from ...losses.keras.ctc_losses import CtcLoss
-
-
-class CtcModel(BaseCtcModel):
-    """ Keras CTC Model Warper """
-    @property
-    def metrics(self):
-        return [self.loss_metric]
-
-    def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, **kwargs):
-        loss = CtcLoss(blank=blank, global_batch_size=global_batch_size)
-        self.use_loss_scale = use_loss_scale
-        if self.use_loss_scale:
-            optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic")
-        self.loss_metric = tf.keras.metrics.Mean(name="ctc_loss", dtype=tf.float32)
-        super(CtcModel, self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs)
-
-    def train_step(self, batch):
-        x, y_true = batch
-        with tf.GradientTape() as tape:
-            logit = self(x["input"], training=True)
-            y_pred = {
-                "logit": logit,
-                "logit_length": get_reduced_length(x["input_length"], self.time_reduction_factor)
-            }
-            loss = self.loss(y_true, y_pred)
-            if self.use_loss_scale:
-                scaled_loss = self.optimizer.get_scaled_loss(loss)
-        if self.use_loss_scale:
-            scaled_gradients = tape.gradient(scaled_loss, self.trainable_weights)
-            gradients = self.optimizer.get_unscaled_gradients(scaled_gradients)
-        else:
-            gradients = tape.gradient(loss, self.trainable_weights)
-        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
-        self.loss_metric.update_state(loss)
-        return {m.name: m.result() for m in self.metrics}
-
-    def test_step(self, batch):
-        x, y_true = batch
-        logit = self(x["input"], training=False)
-        y_pred = {
-            "logit": logit,
-            "logit_length": get_reduced_length(x["input_length"], self.time_reduction_factor)
-        }
-        loss = self.loss(y_true, y_pred)
-        self.loss_metric.update_state(loss)
-        return {m.name: m.result() for m in self.metrics}
diff --git a/tensorflow_asr/models/keras/deepspeech2.py b/tensorflow_asr/models/keras/deepspeech2.py
deleted file mode 100644
index 0c685e87c5..0000000000
--- a/tensorflow_asr/models/keras/deepspeech2.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .ctc import CtcModel
-from ..deepspeech2 import ConvModule, RnnModule, FcModule
-
-
-class DeepSpeech2(CtcModel):
-    def __init__(self,
-                 vocabulary_size: int,
-                 conv_type: str = "conv2d",
-                 conv_kernels: list = [[11, 41], [11, 21], [11, 21]],
-                 conv_strides: list = [[2, 2], [1, 2], [1, 2]],
-                 conv_filters: list = [32, 32, 96],
-                 conv_dropout: float = 0.1,
-                 rnn_nlayers: int = 5,
-                 rnn_type: str = "lstm",
-                 rnn_units: int = 1024,
-                 rnn_bidirectional: bool = True,
-                 rnn_rowconv: int = 0,
-                 rnn_dropout: float = 0.1,
-                 fc_nlayers: int = 0,
-                 fc_units: int = 1024,
-                 fc_dropout: float = 0.1,
-                 name: str = "deepspeech2",
-                 **kwargs):
-        super(DeepSpeech2, self).__init__(name=name, **kwargs)
-
-        self.conv_module = ConvModule(
-            conv_type=conv_type,
-            kernels=conv_kernels,
-            strides=conv_strides,
-            filters=conv_filters,
-            dropout=conv_dropout,
-            name=f"{self.name}_conv_module"
-        )
-
-        self.rnn_module = RnnModule(
-            nlayers=rnn_nlayers,
-            rnn_type=rnn_type,
-            units=rnn_units,
-            bidirectional=rnn_bidirectional,
-            rowconv=rnn_rowconv,
-            dropout=rnn_dropout,
-            name=f"{self.name}_rnn_module"
-        )
-
-        self.fc_module = FcModule(
-            nlayers=fc_nlayers,
-            units=fc_units,
-            dropout=fc_dropout,
-            vocabulary_size=vocabulary_size,
-            name=f"{self.name}_fc_module"
-        )
-
-        self.time_reduction_factor = self.conv_module.reduction_factor
-
-    def call(self, inputs, training=False, **kwargs):
-        outputs = self.conv_module(inputs, training=training, **kwargs)
-        outputs = self.rnn_module(outputs, training=training, **kwargs)
-        outputs = self.fc_module(outputs, training=training, **kwargs)
-        return outputs
-
-    def summary(self, line_length=100, **kwargs):
-        self.conv_module.summary(line_length=line_length, **kwargs)
-        self.rnn_module.summary(line_length=line_length, **kwargs)
-        self.fc_module.summary(line_length=line_length, **kwargs)
-        super(DeepSpeech2, self).summary(line_length=line_length, **kwargs)
-
-    def get_config(self):
-        conf = super(DeepSpeech2, self).get_config()
-        conf.update(self.conv_module.get_config())
-        conf.update(self.rnn_module.get_config())
-        conf.update(self.fc_module.get_config())
-        return conf
diff --git a/tensorflow_asr/models/keras/jasper.py b/tensorflow_asr/models/keras/jasper.py
deleted file mode 100644
index b19010d1b0..0000000000
--- a/tensorflow_asr/models/keras/jasper.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tensorflow as tf
-
-from .ctc import CtcModel
-from ..jasper import Reshape, JasperBlock, JasperSubBlock
-
-
-class Jasper(CtcModel):
-    def __init__(self,
-                 vocabulary_size: int,
-                 dense: bool = False,
-                 first_additional_block_channels: int = 256,
-                 first_additional_block_kernels: int = 11,
-                 first_additional_block_strides: int = 2,
-                 first_additional_block_dilation: int = 1,
-                 first_additional_block_dropout: int = 0.2,
-                 nsubblocks: int = 5,
-                 block_channels: list = [256, 384, 512, 640, 768],
-                 block_kernels: list = [11, 13, 17, 21, 25],
-                 block_dropout: list = [0.2, 0.2, 0.2, 0.3, 0.3],
-                 second_additional_block_channels: int = 896,
-                 second_additional_block_kernels: int = 1,
-                 second_additional_block_strides: int = 1,
-                 second_additional_block_dilation: int = 2,
-                 second_additional_block_dropout: int = 0.4,
-                 third_additional_block_channels: int = 1024,
-                 third_additional_block_kernels: int = 1,
-                 third_additional_block_strides: int = 1,
-                 third_additional_block_dilation: int = 1,
-                 third_additional_block_dropout: int = 0.4,
-                 kernel_regularizer=None,
-                 bias_regularizer=None,
-                 name: str = "jasper",
-                 **kwargs):
-        super(Jasper, self).__init__(name=name, **kwargs)
-
-        assert len(block_channels) == len(block_kernels) == len(block_dropout)
-
-        self.reshape = Reshape(name=f"{self.name}_reshape")
-
-        self.first_additional_block = JasperSubBlock(
-            channels=first_additional_block_channels,
-            kernels=first_additional_block_kernels,
-            strides=first_additional_block_strides,
-            dropout=first_additional_block_dropout,
-            dilation=first_additional_block_dilation,
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer,
-            name=f"{self.name}_first_block"
-        )
-
-        self.blocks = [
-            JasperBlock(
-                nsubblocks=nsubblocks,
-                channels=block_channels[i],
-                kernels=block_kernels[i],
-                dropout=block_dropout[i],
-                dense=dense,
-                nresiduals=(i + 1) if dense else 1,
-                kernel_regularizer=kernel_regularizer,
-                bias_regularizer=bias_regularizer,
-                name=f"{self.name}_block_{i}"
-            ) for i in range(len(block_channels))
-        ]
-
-        self.second_additional_block = JasperSubBlock(
-            channels=second_additional_block_channels,
-            kernels=second_additional_block_kernels,
-            strides=second_additional_block_strides,
-            dropout=second_additional_block_dropout,
-            dilation=second_additional_block_dilation,
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer,
-            name=f"{self.name}_second_block"
-        )
-
-        self.third_additional_block = JasperSubBlock(
-            channels=third_additional_block_channels,
-            kernels=third_additional_block_kernels,
-            strides=third_additional_block_strides,
-            dropout=third_additional_block_dropout,
-            dilation=third_additional_block_dilation,
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer,
-            name=f"{self.name}_third_block"
-        )
-
-        self.last_block = tf.keras.layers.Conv1D(
-            filters=vocabulary_size, kernel_size=1,
-            strides=1, padding="same",
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer,
-            name=f"{self.name}_last_block"
-        )
-
-        self.time_reduction_factor = self.first_additional_block.reduction_factor
-        self.time_reduction_factor *= self.second_additional_block.reduction_factor
-        self.time_reduction_factor *= self.third_additional_block.reduction_factor
-
-    def call(self, inputs, training=False, **kwargs):
-        outputs = self.reshape(inputs)
-        outputs = self.first_additional_block(outputs, training=training, **kwargs)
-
-        residuals = []
-        for block in self.blocks:
-            outputs, residuals = block([outputs, residuals], training=training, **kwargs)
-
-        outputs = self.second_additional_block(outputs, training=training, **kwargs)
-        outputs = self.third_additional_block(outputs, training=training, **kwargs)
-        outputs = self.last_block(outputs, training=training, **kwargs)
-        return outputs
-
-    def summary(self, line_length=100, **kwargs):
-        super(Jasper, self).summary(line_length=line_length, **kwargs)
-
-    def get_config(self):
-        conf = self.reshape.get_config()
-        conf.update(self.first_additional_block.get_config())
-        for block in self.blocks:
-            conf.update(block.get_config())
-        conf.update(self.second_additional_block.get_config())
-        conf.update(self.third_additional_block.get_config())
-        conf.update(self.last_block.get_config())
-        return conf
diff --git a/tensorflow_asr/models/keras/streaming_transducer.py b/tensorflow_asr/models/keras/streaming_transducer.py
deleted file mode 100644
index 8ebb81e279..0000000000
--- a/tensorflow_asr/models/keras/streaming_transducer.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tensorflow as tf
-
-
-from .transducer import Transducer
-from ..streaming_transducer import StreamingTransducerEncoder
-from ...utils.utils import shape_list
-
-
-class StreamingTransducer(Transducer):
-    def __init__(self,
-                 vocabulary_size: int,
-                 encoder_reductions: dict = {0: 3, 1: 2},
-                 encoder_dmodel: int = 640,
-                 encoder_nlayers: int = 8,
-                 encoder_rnn_type: str = "lstm",
-                 encoder_rnn_units: int = 2048,
-                 encoder_layer_norm: bool = True,
-                 encoder_trainable: bool = True,
-                 prediction_embed_dim: int = 320,
-                 prediction_embed_dropout: float = 0,
-                 prediction_num_rnns: int = 2,
-                 prediction_rnn_units: int = 2048,
-                 prediction_rnn_type: str = "lstm",
-                 prediction_layer_norm: bool = True,
-                 prediction_projection_units: int = 640,
-                 prediction_trainable: bool = True,
-                 joint_dim: int = 640,
-                 joint_activation: str = "tanh",
-                 prejoint_linear: bool = True,
-                 postjoint_linear: bool = False,
-                 joint_mode: str = "add",
-                 joint_trainable: bool = True,
-                 kernel_regularizer = None,
-                 bias_regularizer = None,
-                 name = "StreamingTransducer",
-                 **kwargs):
-        super(StreamingTransducer, self).__init__(
-            encoder=StreamingTransducerEncoder(
-                reductions=encoder_reductions,
-                dmodel=encoder_dmodel,
-                nlayers=encoder_nlayers,
-                rnn_type=encoder_rnn_type,
-                rnn_units=encoder_rnn_units,
-                layer_norm=encoder_layer_norm,
-                kernel_regularizer=kernel_regularizer,
-                bias_regularizer=bias_regularizer,
-                trainable=encoder_trainable,
-                name=f"{name}_encoder"
-            ),
-            vocabulary_size=vocabulary_size,
-            embed_dim=prediction_embed_dim,
-            embed_dropout=prediction_embed_dropout,
-            num_rnns=prediction_num_rnns,
-            rnn_units=prediction_rnn_units,
-            rnn_type=prediction_rnn_type,
-            layer_norm=prediction_layer_norm,
-            projection_units=prediction_projection_units,
-            prediction_trainable=prediction_trainable,
-            joint_dim=joint_dim,
-            joint_activation=joint_activation,
-            prejoint_linear=prejoint_linear,
-            postjoint_linear=postjoint_linear,
-            joint_mode=joint_mode,
-            joint_trainable=joint_trainable,
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer,
-            name=name, **kwargs
-        )
-        self.time_reduction_factor = self.encoder.time_reduction_factor
-
-    def encoder_inference(self, features: tf.Tensor, states: tf.Tensor):
-        """Infer function for encoder (or encoders)
-
-        Args:
-            features (tf.Tensor): features with shape [T, F, C]
-            states (tf.Tensor): previous states of encoders with shape [num_rnns, 1 or 2, 1, P]
-
-        Returns:
-            tf.Tensor: output of encoders with shape [T, E]
-            tf.Tensor: states of encoders with shape [num_rnns, 1 or 2, 1, P]
-        """
-        with tf.name_scope(f"{self.name}_encoder"):
-            outputs = tf.expand_dims(features, axis=0)
-            outputs, new_states = self.encoder.recognize(outputs, states)
-            return tf.squeeze(outputs, axis=0), new_states
-
-    # -------------------------------- GREEDY -------------------------------------
-
-    @tf.function
-    def recognize(self,
-                  features: tf.Tensor,
-                  input_length: tf.Tensor,
-                  parallel_iterations: int = 10,
-                  swap_memory: bool = True):
-        """
-        RNN Transducer Greedy decoding
-        Args:
-            features (tf.Tensor): a batch of padded extracted features
-
-        Returns:
-            tf.Tensor: a batch of decoded transcripts
-        """
-        batch_size, _, _, _ = shape_list(features)
-        encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size))
-        return self._perform_greedy_batch(encoded, input_length,
-                                          parallel_iterations=parallel_iterations, swap_memory=swap_memory)
-
-    def recognize_tflite(self, signal, predicted, encoder_states, prediction_states):
-        """
-        Function to convert to tflite using greedy decoding (default streaming mode)
-        Args:
-            signal: tf.Tensor with shape [None] indicating a single audio signal
-            predicted: last predicted character with shape []
-            encoder_states: lastest encoder states with shape [num_rnns, 1 or 2, 1, P]
-            prediction_states: lastest prediction states with shape [num_rnns, 1 or 2, 1, P]
-
-        Return:
-            transcript: tf.Tensor of Unicode Code Points with shape [None] and dtype tf.int32
-            predicted: last predicted character with shape []
-            encoder_states: lastest encoder states with shape [num_rnns, 1 or 2, 1, P]
-            prediction_states: lastest prediction states with shape [num_rnns, 1 or 2, 1, P]
-        """
-        features = self.speech_featurizer.tf_extract(signal)
-        encoded, new_encoder_states = self.encoder_inference(features, encoder_states)
-        hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, prediction_states)
-        transcript = self.text_featurizer.indices2upoints(hypothesis.prediction)
-        return transcript, hypothesis.index, new_encoder_states, hypothesis.states
-
-    def recognize_tflite_with_timestamp(self, signal, predicted, encoder_states, prediction_states):
-        features = self.speech_featurizer.tf_extract(signal)
-        encoded, new_encoder_states = self.encoder_inference(features, encoder_states)
-        hypothesis = self._perform_greedy(encoded, tf.shape(encoded)[0], predicted, prediction_states)
-        indices = self.text_featurizer.normalize_indices(hypothesis.prediction)
-        upoints = tf.gather_nd(self.text_featurizer.upoints, tf.expand_dims(indices, axis=-1))  # [None, max_subword_length]
-
-        num_samples = tf.cast(tf.shape(signal)[0], dtype=tf.float32)
-        total_time_reduction_factor = self.time_reduction_factor * self.speech_featurizer.frame_step
-
-        stime = tf.range(0, num_samples, delta=total_time_reduction_factor, dtype=tf.float32)
-        stime /= tf.cast(self.speech_featurizer.sample_rate, dtype=tf.float32)
-
-        etime = tf.range(total_time_reduction_factor, num_samples, delta=total_time_reduction_factor, dtype=tf.float32)
-        etime /= tf.cast(self.speech_featurizer.sample_rate, dtype=tf.float32)
-
-        non_blank = tf.where(tf.not_equal(upoints, 0))
-        non_blank_transcript = tf.gather_nd(upoints, non_blank)
-        non_blank_stime = tf.gather_nd(tf.repeat(tf.expand_dims(stime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank)
-        non_blank_etime = tf.gather_nd(tf.repeat(tf.expand_dims(etime, axis=-1), tf.shape(upoints)[-1], axis=-1), non_blank)
-
-        return non_blank_transcript, non_blank_stime, non_blank_etime, hypothesis.index, new_encoder_states, hypothesis.states
-
-    # -------------------------------- BEAM SEARCH -------------------------------------
-
-    @tf.function
-    def recognize_beam(self,
-                       features: tf.Tensor,
-                       input_length: tf.Tensor,
-                       lm: bool = False,
-                       parallel_iterations: int = 10,
-                       swap_memory: bool = True):
-        """
-        RNN Transducer Beam Search
-        Args:
-            features (tf.Tensor): a batch of padded extracted features
-            lm (bool, optional): whether to use language model. Defaults to False.
-
-        Returns:
-            tf.Tensor: a batch of decoded transcripts
-        """
-        batch_size, _, _, _ = shape_list(features)
-        encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size))
-        return self._perform_beam_search_batch(encoded, input_length, lm,
-                                               parallel_iterations=parallel_iterations, swap_memory=swap_memory)
-
-    # -------------------------------- TFLITE -------------------------------------
-
-    def make_tflite_function(self, timestamp: bool = True):
-        tflite_func = self.recognize_tflite_with_timestamp if timestamp else self.recognize_tflite
-        return tf.function(
-            tflite_func,
-            input_signature=[
-                tf.TensorSpec([None], dtype=tf.float32),
-                tf.TensorSpec([], dtype=tf.int32),
-                tf.TensorSpec(self.encoder.get_initial_state().get_shape(), dtype=tf.float32),
-                tf.TensorSpec(self.predict_net.get_initial_state().get_shape(), dtype=tf.float32)
-            ]
-        )
diff --git a/tensorflow_asr/models/keras/transducer.py b/tensorflow_asr/models/keras/transducer.py
deleted file mode 100644
index 269ad65b90..0000000000
--- a/tensorflow_asr/models/keras/transducer.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" https://arxiv.org/pdf/1811.06621.pdf """
-
-import tensorflow as tf
-from tensorflow.keras import mixed_precision as mxp
-
-from ..transducer import Transducer as BaseTransducer
-from ...utils.utils import get_reduced_length
-from ...losses.keras.rnnt_losses import RnntLoss
-
-
-class Transducer(BaseTransducer):
-    """ Keras Transducer Model Warper """
-    @property
-    def metrics(self):
-        return [self.loss_metric]
-
-    def _build(self, input_shape, prediction_shape=[None], batch_size=None):
-        inputs = tf.keras.Input(shape=input_shape, batch_size=batch_size, dtype=tf.float32)
-        input_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
-        pred = tf.keras.Input(shape=prediction_shape, batch_size=batch_size, dtype=tf.int32)
-        pred_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
-        self({
-            "input": inputs,
-            "input_length": input_length,
-            "prediction": pred,
-            "prediction_length": pred_length
-        }, training=False)
-
-    def call(self, inputs, training=False, **kwargs):
-        features = inputs["input"]
-        prediction = inputs["prediction"]
-        prediction_length = inputs["prediction_length"]
-        enc = self.encoder(features, training=training, **kwargs)
-        pred = self.predict_net([prediction, prediction_length], training=training, **kwargs)
-        outputs = self.joint_net([enc, pred], training=training, **kwargs)
-        return {
-            "logit": outputs,
-            "logit_length": get_reduced_length(inputs["input_length"], self.time_reduction_factor)
-        }
-
-    def compile(self, optimizer, global_batch_size, blank=0, use_loss_scale=False, run_eagerly=None, **kwargs):
-        loss = RnntLoss(blank=blank, global_batch_size=global_batch_size)
-        self.use_loss_scale = use_loss_scale
-        if self.use_loss_scale:
-            optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic")
-        self.loss_metric = tf.keras.metrics.Mean(name="rnnt_loss", dtype=tf.float32)
-        super(Transducer, self).compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs)
-
-    def train_step(self, batch):
-        x, y_true = batch
-        with tf.GradientTape() as tape:
-            y_pred = self({
-                "input": x["input"],
-                "input_length": x["input_length"],
-                "prediction": x["prediction"],
-                "prediction_length": x["prediction_length"],
-            }, training=True)
-            loss = self.loss(y_true, y_pred)
-            if self.use_loss_scale:
-                scaled_loss = self.optimizer.get_scaled_loss(loss)
-        if self.use_loss_scale:
-            scaled_gradients = tape.gradient(scaled_loss, self.trainable_weights)
-            gradients = self.optimizer.get_unscaled_gradients(scaled_gradients)
-        else:
-            gradients = tape.gradient(loss, self.trainable_weights)
-        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
-        self.loss_metric.update_state(loss)
-        return {m.name: m.result() for m in self.metrics}
-
-    def test_step(self, batch):
-        x, y_true = batch
-        y_pred = self({
-            "input": x["input"],
-            "input_length": x["input_length"],
-            "prediction": x["prediction"],
-            "prediction_length": x["prediction_length"],
-        }, training=False)
-        loss = self.loss(y_true, y_pred)
-        self.loss_metric.update_state(loss)
-        return {m.name: m.result() for m in self.metrics}
diff --git a/tensorflow_asr/runners/README.md b/tensorflow_asr/runners/README.md
deleted file mode 100644
index d7eebe3b27..0000000000
--- a/tensorflow_asr/runners/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# Runners :wink:
-
-## Trainers
-
-The trainers use `BaseTrainer` for training models. To create a custom trainer, define these following methods:
-
-1. Set metrics (`train_metrics` and `eval_metrics`)
-2. `_train_step` to process batch of data when training
-3. `_eval_step` to process batch of data when validating
-4. `compile` for loading built models, optimizers and any custom variables
-5. Optionally define `save_model_weights` for save latest model weights every checkpoint. 
-
-## Testers
-
-_Testers only run in 1 GPU_ because we have to use `tf.py_function` or `tf.numpy_function` to calculate WER, CER, Semetrics, ...
-
-The testers for **acoustic** models are combined into single class `BaseTester`. Therefore you don't need to define any custom tester for **acoustic** models.
-
-The `BaseTester` do the steps as follows:
-
-1. Load test dataset.
-2. Run testing with `greedy` decoding, `beamsearch` decoding and if you provide an `Scorer` in `TextFeaturizer.scorer`, it will decode `beamsearch_with_lm`, otherwise another `beamsearch`.
-3. The result of `greedy`, `beamsearch` and `beamsearch_with_lm` are written to the `test.tsv` file in the `outdir` configured in `.yml` config file.
-4. Finish testing by reloading whole `test.tsv` and calculate `WER, CER` from it, the results are printed to stdout.
diff --git a/tensorflow_asr/runners/__init__.py b/tensorflow_asr/runners/__init__.py
deleted file mode 100644
index b750f2499e..0000000000
--- a/tensorflow_asr/runners/__init__.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import tensorflow as tf
-
-
-def save_from_checkpoint(func,
-                         outdir: str,
-                         max_to_keep: int = 10,
-                         **kwargs):
-    """
-    Function to save models from latest saved checkpoint
-    Args:
-        func: function takes inputs as **kwargs and performs when checkpoint is found
-        outdir: logging directory
-        max_to_keep: number of checkpoints to keep
-        **kwargs: contains built models, optimizers
-    """
-    steps = tf.Variable(0, trainable=False, dtype=tf.int64)  # Step must be int64
-    epochs = tf.Variable(1, trainable=False)
-    checkpoint_dir = os.path.join(outdir, "checkpoints")
-    if not os.path.exists(checkpoint_dir):
-        raise ValueError(f"checkpoint directory not found: {checkpoint_dir}")
-    ckpt = tf.train.Checkpoint(steps=steps, epochs=epochs, **kwargs)
-    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=max_to_keep)
-    if ckpt_manager.latest_checkpoint:
-        ckpt.restore(ckpt_manager.latest_checkpoint)
-        func(**kwargs)
-    else:
-        raise ValueError("no lastest checkpoint found")
diff --git a/tensorflow_asr/runners/base_runners.py b/tensorflow_asr/runners/base_runners.py
deleted file mode 100644
index 696e0718ca..0000000000
--- a/tensorflow_asr/runners/base_runners.py
+++ /dev/null
@@ -1,498 +0,0 @@
-# This implementation is inspired from
-# https://github.com/dathudeptrai/TensorflowTTS/blob/master/tensorflow_tts/trainers/base_trainer.py
-# Copyright 2020 Minh Nguyen (@dathudeptrai) Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import abc
-import os
-from tqdm import tqdm
-from colorama import Fore
-
-import numpy as np
-import tensorflow as tf
-
-from ..configs.config import RunningConfig
-from ..utils.utils import get_num_batches, bytes_to_string, get_reduced_length
-from ..utils.metrics import ErrorRate, wer, cer
-
-
-class BaseRunner(metaclass=abc.ABCMeta):
-    """ Customized runner module for all models """
-
-    def __init__(self, config: RunningConfig):
-        self.config = config
-        # Writers
-        self.writers = {
-            "train": tf.summary.create_file_writer(os.path.join(self.config.outdir, "tensorboard", "train")),
-            "eval": tf.summary.create_file_writer(os.path.join(self.config.outdir, "tensorboard", "eval"))
-        }
-
-    def add_writer(self, stage: str):
-        self.writers[stage] = tf.summary.create_file_writer(os.path.join(self.config.outdir, "tensorboard", stage))
-
-    def _write_to_tensorboard(self,
-                              list_metrics: dict,
-                              step: any,
-                              stage: str = "train"):
-        """Write variables to tensorboard."""
-        writer = self.writers.get(stage, None)
-
-        if writer is None:
-            raise ValueError(f"Missing writer for stage {stage}")
-
-        with writer.as_default():
-            for key, value in list_metrics.items():
-                if isinstance(value, tf.keras.metrics.Metric):
-                    tf.summary.scalar(key, value.result(), step=step)
-                else:
-                    tf.summary.scalar(key, value, step=step)
-                writer.flush()
-
-
-class BaseTrainer(BaseRunner):
-    """Customized trainer module for all models."""
-
-    def __init__(self,
-                 config: RunningConfig,
-                 strategy: tf.distribute.Strategy = None):
-        # Configurations
-        super(BaseTrainer, self).__init__(config)
-        self.set_strategy(strategy)
-        # Steps and Epochs start from 0
-        # Step must be int64 to use tf.summary
-        self.steps = tf.Variable(0, trainable=False, dtype=tf.int64)
-        self.train_steps_per_epoch = None
-        self.eval_steps_per_epoch = None
-        # Dataset
-        self.train_data_loader = None
-        self.eval_data_loader = None
-
-        with self.strategy.scope():
-            self.set_train_metrics()
-            self.set_eval_metrics()
-
-    @property
-    def total_train_steps(self):
-        if self.train_steps_per_epoch is None: return None
-        return self.config.num_epochs * self.train_steps_per_epoch
-
-    @property
-    def epochs(self):
-        if self.train_steps_per_epoch is None: return 1
-        return (self.steps.numpy() // self.train_steps_per_epoch) + 1
-
-    # -------------------------------- GET SET -------------------------------------
-
-    @abc.abstractmethod
-    def set_train_metrics(self):
-        self.train_metrics = {}
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def set_eval_metrics(self):
-        self.eval_metrics = {}
-        raise NotImplementedError()
-
-    def set_strategy(self, strategy=None):
-        if strategy is None:
-            gpus = tf.config.experimental.list_physical_devices('GPU')
-            self.strategy = tf.distribute.OneDeviceStrategy("/GPU:0") if gpus else \
-                tf.distribute.OneDeviceStrategy("/CPU:0")
-        else:
-            self.strategy = strategy
-
-    def set_train_data_loader(self, train_dataset, train_bs=None, train_acs=None):
-        """ Set train data loader (MUST). """
-        if not train_bs: train_bs = self.config.batch_size
-        self.global_batch_size = train_bs * self.strategy.num_replicas_in_sync
-        self.config.batch_size = train_bs  # Update batch size fed from arguments
-
-        if not train_acs: train_acs = self.config.accumulation_steps
-        self.config.accumulation_steps = train_acs  # update accum steps fed from arguments
-
-        self.train_data = train_dataset.create(self.global_batch_size)
-        self.train_data_loader = self.strategy.experimental_distribute_dataset(self.train_data)
-        if hasattr(self, "accumulation") and train_dataset.total_steps is not None:
-            self.train_steps_per_epoch = train_dataset.total_steps // self.config.accumulation_steps
-        else:
-            self.train_steps_per_epoch = train_dataset.total_steps
-
-    def set_eval_data_loader(self, eval_dataset, eval_bs=None):
-        """ Set eval data loader (MUST).
-        Eval batch might be significantly greater than train batch """
-        if eval_dataset is None:
-            self.eval_data = None
-            self.eval_data_loader = None
-            return
-        if not eval_bs: eval_bs = self.config.batch_size
-        self.eval_data = eval_dataset.create(eval_bs * self.strategy.num_replicas_in_sync)
-        if self.eval_data is None:
-            self.eval_data_loader = None
-        else:
-            self.eval_data_loader = self.strategy.experimental_distribute_dataset(self.eval_data)
-        self.eval_steps_per_epoch = eval_dataset.total_steps
-
-    # -------------------------------- CHECKPOINTS -------------------------------------
-
-    def create_checkpoint_manager(self, max_to_keep=10, **kwargs):
-        """Create checkpoint management."""
-        with self.strategy.scope():
-            self.ckpt = tf.train.Checkpoint(steps=self.steps, **kwargs)
-            checkpoint_dir = os.path.join(self.config.outdir, "checkpoints")
-            if not tf.io.gfile.exists(checkpoint_dir): tf.io.gfile.makedirs(checkpoint_dir)
-            self.ckpt_manager = tf.train.CheckpointManager(self.ckpt, checkpoint_dir, max_to_keep=max_to_keep)
-
-    def save_checkpoint(self):
-        """Save checkpoint."""
-        with self.strategy.scope():
-            self.ckpt_manager.save()
-            self.train_progbar.set_postfix_str("Successfully Saved Checkpoint")
-
-    def load_checkpoint(self):
-        """Load checkpoint."""
-        with self.strategy.scope():
-            if self.ckpt_manager.latest_checkpoint:
-                self.ckpt.restore(self.ckpt_manager.latest_checkpoint)
-
-    def save_model_weights(self):
-        """ Save the latest model's weights at each save_interval_steps """
-        pass
-
-    # -------------------------------- RUNNING -------------------------------------
-
-    def _finished(self):
-        if self.train_steps_per_epoch is None:
-            return False
-        return self.steps.numpy() >= self.total_train_steps
-
-    def run(self):
-        """Run training."""
-        if self.steps.numpy() > 0: tf.print("Resume training ...")
-
-        self.train_progbar = tqdm(
-            initial=self.steps.numpy(), unit="batch", total=self.total_train_steps,
-            position=0, leave=True,
-            bar_format="{desc} |%s{bar:20}%s{r_bar}" % (Fore.GREEN, Fore.RESET),
-            desc="[Train]"
-        )
-
-        while not self._finished():
-            self._train_epoch()
-
-        # save and evaluate when training is done
-        self.save_checkpoint()
-        self.save_model_weights()
-        self.log_train_metrics()
-        self._eval_epoch()
-
-        self.train_progbar.close()
-        print("> Finish training")
-
-    def _train_epoch(self):
-        """Train model one epoch."""
-        train_iterator = iter(self.train_data_loader)
-        train_steps = 0
-        while True:
-            try:
-                self._train_function(train_iterator)  # Run train step
-            except StopIteration:
-                break
-            except tf.errors.OutOfRangeError:
-                break
-            except Exception as e:
-                raise e
-
-            # Update steps
-            self.steps.assign_add(1)
-            self.train_progbar.update(1)
-            train_steps += 1
-
-            # Run save checkpoint
-            self._check_save_interval()
-
-            # Print epoch info
-            self.train_progbar.set_description_str(f"[Train] [Epoch {self.epochs}/{self.config.num_epochs}]")
-
-            # Print train info to progress bar
-            self._print_train_metrics(self.train_progbar)
-
-            # Run logging
-            self._check_log_interval()
-
-            # Run evaluation
-            self._check_eval_interval()
-
-        self.train_steps_per_epoch = train_steps
-        self.train_progbar.total = self.total_train_steps
-        self.train_progbar.refresh()
-
-    @tf.function
-    def _train_function(self, iterator):
-        batch = next(iterator)
-        self.strategy.run(self._train_step, args=(batch,))
-
-    @abc.abstractmethod
-    def _train_step(self, batch):
-        """ One step training. Does not return anything"""
-        raise NotImplementedError()
-
-    def _eval_epoch(self):
-        """One epoch evaluation."""
-        if not self.eval_data_loader: return
-
-        print("\n> Start evaluation ...")
-
-        for metric in self.eval_metrics.keys():
-            self.eval_metrics[metric].reset_states()
-
-        eval_progbar = tqdm(
-            initial=0, total=self.eval_steps_per_epoch, unit="batch",
-            position=0, leave=True,
-            bar_format="{desc} |%s{bar:20}%s{r_bar}" % (Fore.BLUE, Fore.RESET),
-            desc=f"[Eval] [Step {self.steps.numpy()}]"
-        )
-        eval_iterator = iter(self.eval_data_loader)
-        eval_steps = 0
-
-        while True:
-            try:
-                self._eval_function(eval_iterator)  # Run eval step
-            except StopIteration:
-                break
-            except tf.errors.OutOfRangeError:
-                break
-            except Exception as e:
-                raise e
-
-            # Update steps
-            eval_progbar.update(1)
-            eval_steps += 1
-
-            # Print eval info to progress bar
-            self._print_eval_metrics(eval_progbar)
-
-        self.eval_steps_per_epoch = eval_steps
-        eval_progbar.close()
-        # Write to tensorboard
-        self._write_to_tensorboard(self.eval_metrics, self.steps, stage="eval")
-
-        print("> End evaluation ...")
-
-    @tf.function
-    def _eval_function(self, iterator):
-        batch = next(iterator)
-        self.strategy.run(self._eval_step, args=(batch,))
-
-    @abc.abstractmethod
-    def _eval_step(self, batch):
-        """One eval step. Does not return anything"""
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def compile(self, *args, **kwargs):
-        """ Function to initialize models and optimizers """
-        raise NotImplementedError()
-
-    def fit(self, train_dataset, eval_dataset=None, train_bs=None, train_acs=None, eval_bs=None):
-        """ Function run start training, including executing "run" func """
-        self.set_train_data_loader(train_dataset, train_bs, train_acs)
-        self.set_eval_data_loader(eval_dataset, eval_bs)
-        self.load_checkpoint()
-        self.run()
-
-    # -------------------------------- LOGGING -------------------------------------
-
-    def log_train_metrics(self):
-        self._write_to_tensorboard(self.train_metrics, self.steps, stage="train")
-        """Reset train metrics after save it to tensorboard."""
-        for metric in self.train_metrics.keys():
-            self.train_metrics[metric].reset_states()
-
-    def _check_log_interval(self):
-        """Save log interval."""
-        if (self.steps.numpy() % self.config.log_interval_steps == 0):
-            self.log_train_metrics()
-
-    def _check_save_interval(self):
-        """Save log interval."""
-        if (self.steps.numpy() % self.config.save_interval_steps == 0):
-            self.save_checkpoint()
-            self.save_model_weights()
-
-    def _check_eval_interval(self):
-        """Save log interval."""
-        if (self.steps.numpy() % self.config.eval_interval_steps == 0):
-            self._eval_epoch()
-
-    # -------------------------------- UTILS -------------------------------------
-
-    def _print_train_metrics(self, progbar):
-        result_dict = {key: str(value.result().numpy()) for key, value in self.train_metrics.items()}
-        progbar.set_postfix(result_dict)
-
-    def _print_eval_metrics(self, progbar):
-        result_dict = {key: str(value.result().numpy()) for key, value in self.eval_metrics.items()}
-        progbar.set_postfix(result_dict)
-
-    # -------------------------------- END -------------------------------------
-
-
-class BaseTester(BaseRunner):
-    """ Customized tester module for all models
-    This tester model will write results to test.tsv file in outdir
-    After writing finished, it will calculate testing metrics
-    """
-
-    def __init__(self,
-                 config: RunningConfig,
-                 output_name: str = "test"):
-        super(BaseTester, self).__init__(config)
-        self.test_data_loader = None
-        self.processed_records = 0
-
-        self.output_file_path = os.path.join(self.config.outdir, f"{output_name}.tsv")
-        self.test_metrics = {
-            "beam_wer": ErrorRate(func=wer, name="test_beam_wer", dtype=tf.float32),
-            "beam_cer": ErrorRate(func=cer, name="test_beam_cer", dtype=tf.float32),
-            "beam_lm_wer": ErrorRate(func=wer, name="test_beam_lm_wer", dtype=tf.float32),
-            "beam_lm_cer": ErrorRate(func=cer, name="test_beam_lm_cer", dtype=tf.float32),
-            "greed_wer": ErrorRate(func=wer, name="test_greed_wer", dtype=tf.float32),
-            "greed_cer": ErrorRate(func=cer, name="test_greed_cer", dtype=tf.float32)
-        }
-
-    def set_output_file(self, batch_size: int = 1):
-        if not batch_size: batch_size = self.config.batch_size
-        if os.path.exists(self.output_file_path):
-            with open(self.output_file_path, "r", encoding="utf-8") as out:
-                self.processed_records = get_num_batches(len(out.read().splitlines()) - 1, batch_size=batch_size,
-                                                         drop_remainders=False)
-        else:
-            with open(self.output_file_path, "w") as out:
-                out.write("PATH\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\tBEAMSEARCHLM\n")
-
-    def set_test_data_loader(self, test_dataset, batch_size=None):
-        """Set train data loader (MUST)."""
-        if not batch_size: batch_size = self.config.batch_size
-        self.test_data_loader = test_dataset.create(batch_size)
-        self.total_steps = test_dataset.total_steps
-
-    # -------------------------------- RUNNING -------------------------------------
-
-    def compile(self, trained_model: tf.keras.Model):
-        """ Set loaded trained model """
-        if not hasattr(trained_model, "speech_featurizer"):
-            raise AttributeError("Please do 'add_featurizers' before testing")
-        self.model = trained_model
-
-    def run(self, test_dataset, batch_size=None):
-        self.set_output_file(batch_size=batch_size)
-        self.set_test_data_loader(test_dataset, batch_size=batch_size)
-        self._test_epoch()
-        self._finish()
-
-    def _test_epoch(self):
-        if self.processed_records > 0:
-            self.test_data_loader = self.test_data_loader.skip(self.processed_records)
-        progbar = tqdm(initial=self.processed_records, total=self.total_steps,
-                       unit="batch", position=0, desc="[Test]")
-        test_iter = iter(self.test_data_loader)
-        while True:
-            try:
-                decoded = self._test_function(test_iter)
-            except StopIteration:
-                break
-            except tf.errors.OutOfRangeError:
-                break
-
-            decoded = [None if d is None else d.numpy() for d in decoded]
-            self._append_to_file(*decoded)
-            progbar.update(1)
-
-        progbar.close()
-
-    @tf.function
-    def _test_function(self, iterator):
-        batch = next(iterator)
-        return self._test_step(batch)
-
-    @tf.function(experimental_relax_shapes=True)
-    def _test_step(self, batch):
-        """
-        One testing step
-        Args:
-            batch: a step fed from test dataset
-
-        Returns:
-            (file_paths, groundtruth, greedy, beamsearch, beamsearch_lm) each has shape [B]
-        """
-        file_paths, features, input_length, labels, _, _, _ = batch
-
-        labels = self.model.text_featurizer.iextract(labels)
-        input_length = get_reduced_length(input_length, self.model.time_reduction_factor)
-        greed_pred = self.model.recognize(features, input_length)
-        beam_pred = beam_lm_pred = None
-        if self.model.text_featurizer.decoder_config.beam_width > 0:
-            beam_pred = self.model.recognize_beam(features, input_length, lm=False)
-        if self.model.text_featurizer.decoder_config.lm_config:
-            beam_lm_pred = self.model.recognize_beam(features, input_length, lm=True)
-
-        return file_paths, labels, greed_pred, beam_pred, beam_lm_pred
-
-    # -------------------------------- UTILS -------------------------------------
-
-    def _finish(self):
-        tf.print("\n> Calculating evaluation metrics ...")
-        with open(self.output_file_path, "r", encoding="utf-8") as out:
-            lines = out.read().splitlines()
-            lines = lines[1:]  # skip header
-
-        for line in lines:
-            line = line.split("\t")
-            labels, greed_pred, beam_pred, beam_lm_pred = line[1], line[2], line[3], line[4]
-            labels = tf.convert_to_tensor([labels], dtype=tf.string)
-            greed_pred = tf.convert_to_tensor([greed_pred], dtype=tf.string)
-            beam_pred = tf.convert_to_tensor([beam_pred], dtype=tf.string)
-            beam_lm_pred = tf.convert_to_tensor([beam_lm_pred], dtype=tf.string)
-            # Update metrics
-            self.test_metrics["greed_wer"].update_state(greed_pred, labels)
-            self.test_metrics["greed_cer"].update_state(greed_pred, labels)
-            self.test_metrics["beam_wer"].update_state(beam_pred, labels)
-            self.test_metrics["beam_cer"].update_state(beam_pred, labels)
-            self.test_metrics["beam_lm_wer"].update_state(beam_lm_pred, labels)
-            self.test_metrics["beam_lm_cer"].update_state(beam_lm_pred, labels)
-
-        tf.print("Test results:")
-        tf.print("G_WER =", self.test_metrics["greed_wer"].result())
-        tf.print("G_CER =", self.test_metrics["greed_cer"].result())
-        tf.print("B_WER =", self.test_metrics["beam_wer"].result())
-        tf.print("B_CER =", self.test_metrics["beam_cer"].result())
-        tf.print("BLM_WER =", self.test_metrics["beam_lm_wer"].result())
-        tf.print("BLM_CER =", self.test_metrics["beam_lm_cer"].result())
-
-    def _append_to_file(self,
-                        file_path: np.ndarray,
-                        groundtruth: np.ndarray,
-                        greedy: np.ndarray,
-                        beamsearch: np.ndarray,
-                        beamsearch_lm: np.ndarray):
-        file_path = bytes_to_string(file_path)
-        groundtruth = bytes_to_string(groundtruth)
-        greedy = bytes_to_string(greedy)
-        beamsearch = bytes_to_string(beamsearch) if beamsearch is not None else ["" for _ in file_path]
-        beamsearch_lm = bytes_to_string(beamsearch_lm) if beamsearch_lm is not None else ["" for _ in file_path]
-        with open(self.output_file_path, "a", encoding="utf-8") as out:
-            for i, path in enumerate(file_path):
-                line = f"{groundtruth[i]}\t{greedy[i]}\t{beamsearch[i]}\t{beamsearch_lm[i]}"
-                out.write(f"{path.strip()}\t{line}\n")
-
-    # -------------------------------- END -------------------------------------
diff --git a/tensorflow_asr/runners/ctc_runners.py b/tensorflow_asr/runners/ctc_runners.py
deleted file mode 100644
index ddc2d01dbd..0000000000
--- a/tensorflow_asr/runners/ctc_runners.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import tensorflow as tf
-
-from ..configs.config import RunningConfig
-from ..featurizers.text_featurizers import TextFeaturizer
-from ..losses.ctc_losses import ctc_loss
-from .base_runners import BaseTrainer
-from ..optimizers.accumulation import GradientAccumulation
-from ..utils.utils import get_reduced_length
-
-
-class CTCTrainer(BaseTrainer):
-    """ Trainer for CTC Models """
-
-    def __init__(self,
-                 text_featurizer: TextFeaturizer,
-                 config: RunningConfig,
-                 strategy: tf.distribute.Strategy = None):
-        self.text_featurizer = text_featurizer
-        super(CTCTrainer, self).__init__(config=config, strategy=strategy)
-
-    def set_train_metrics(self):
-        self.train_metrics = {
-            "ctc_loss": tf.keras.metrics.Mean("train_ctc_loss", dtype=tf.float32)
-        }
-
-    def set_eval_metrics(self):
-        self.eval_metrics = {
-            "ctc_loss": tf.keras.metrics.Mean("eval_ctc_loss", dtype=tf.float32),
-        }
-
-    def save_model_weights(self):
-        with self.strategy.scope():
-            self.model.save_weights(os.path.join(self.config.outdir, "latest.h5"))
-
-    @tf.function(experimental_relax_shapes=True)
-    def _train_step(self, batch):
-        _, features, input_length, labels, label_length, _, _ = batch
-
-        with tf.GradientTape() as tape:
-            y_pred = self.model(features, training=True)
-            tape.watch(y_pred)
-            per_train_loss = ctc_loss(
-                y_true=labels, y_pred=y_pred,
-                input_length=get_reduced_length(input_length, self.model.time_reduction_factor),
-                label_length=label_length,
-                blank=self.text_featurizer.blank
-            )
-            train_loss = tf.nn.compute_average_loss(per_train_loss,
-                                                    global_batch_size=self.global_batch_size)
-
-        gradients = tape.gradient(train_loss, self.model.trainable_variables)
-        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
-
-        self.train_metrics["ctc_loss"].update_state(per_train_loss)
-
-    @tf.function(experimental_relax_shapes=True)
-    def _eval_step(self, batch):
-        _, features, input_length, labels, label_length, _, _ = batch
-
-        logits = self.model(features, training=False)
-
-        per_eval_loss = ctc_loss(
-            y_true=labels, y_pred=logits,
-            input_length=get_reduced_length(input_length, self.model.time_reduction_factor),
-            label_length=label_length,
-            blank=self.text_featurizer.blank
-        )
-
-        # Update metrics
-        self.eval_metrics["ctc_loss"].update_state(per_eval_loss)
-
-    def compile(self, model: tf.keras.Model,
-                optimizer: any,
-                max_to_keep: int = 10):
-        with self.strategy.scope():
-            self.model = model
-            self.optimizer = tf.keras.optimizers.get(optimizer)
-        self.create_checkpoint_manager(max_to_keep, model=self.model, optimizer=self.optimizer)
-
-
-class CTCTrainerGA(CTCTrainer):
-    """ Trainer for CTC Models """
-
-    @tf.function
-    def _train_function(self, iterator):
-        for _ in range(self.config.accumulation_steps):
-            batch = next(iterator)
-            self.strategy.run(self._train_step, args=(batch,))
-        self.strategy.run(self._apply_gradients, args=())
-
-    @tf.function
-    def _apply_gradients(self):
-        self.optimizer.apply_gradients(
-            zip(self.accumulation.gradients, self.model.trainable_variables))
-        self.accumulation.reset()
-
-    @tf.function(experimental_relax_shapes=True)
-    def _train_step(self, batch):
-        _, features, input_length, labels, label_length, _, _ = batch
-
-        with tf.GradientTape() as tape:
-            y_pred = self.model(features, training=True)
-            tape.watch(y_pred)
-            per_train_loss = ctc_loss(
-                y_true=labels, y_pred=y_pred,
-                input_length=get_reduced_length(input_length, self.model.time_reduction_factor),
-                label_length=label_length,
-                blank=self.text_featurizer.blank
-            )
-            train_loss = tf.nn.compute_average_loss(per_train_loss,
-                                                    global_batch_size=self.global_batch_size)
-
-        gradients = tape.gradient(train_loss, self.model.trainable_variables)
-        self.accumulation.accumulate(gradients)
-        self.train_metrics["ctc_loss"].update_state(per_train_loss)
-
-    def compile(self, model: tf.keras.Model,
-                optimizer: any,
-                max_to_keep: int = 10):
-        with self.strategy.scope():
-            self.model = model
-            self.optimizer = tf.keras.optimizers.get(optimizer)
-        self.create_checkpoint_manager(max_to_keep, model=self.model, optimizer=self.optimizer)
-        self.accumulation = GradientAccumulation(self.model.trainable_variables)
diff --git a/tensorflow_asr/runners/transducer_runners.py b/tensorflow_asr/runners/transducer_runners.py
deleted file mode 100644
index d8e396be56..0000000000
--- a/tensorflow_asr/runners/transducer_runners.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import tensorflow as tf
-
-from ..configs.config import RunningConfig
-from ..optimizers.accumulation import GradientAccumulation
-from .base_runners import BaseTrainer
-from ..losses.rnnt_losses import rnnt_loss
-from ..models.transducer import Transducer
-from ..featurizers.text_featurizers import TextFeaturizer
-from ..utils.utils import get_reduced_length
-
-
-class TransducerTrainer(BaseTrainer):
-    def __init__(self,
-                 config: RunningConfig,
-                 text_featurizer: TextFeaturizer,
-                 strategy: tf.distribute.Strategy = None):
-        self.text_featurizer = text_featurizer
-        super(TransducerTrainer, self).__init__(config, strategy=strategy)
-
-    def set_train_metrics(self):
-        self.train_metrics = {
-            "transducer_loss": tf.keras.metrics.Mean("train_transducer_loss", dtype=tf.float32)
-        }
-
-    def set_eval_metrics(self):
-        self.eval_metrics = {
-            "transducer_loss": tf.keras.metrics.Mean("eval_transducer_loss", dtype=tf.float32)
-        }
-
-    def save_model_weights(self):
-        self.model.save_weights(os.path.join(self.config.outdir, "latest.h5"))
-
-    @tf.function(experimental_relax_shapes=True)
-    def _train_step(self, batch):
-        _, features, input_length, labels, label_length, prediction, prediction_length = batch
-
-        with tf.GradientTape() as tape:
-            logits = self.model([features, input_length, prediction, prediction_length], training=True)
-            tape.watch(logits)
-            per_train_loss = rnnt_loss(
-                logits=logits, labels=labels, label_length=label_length,
-                logit_length=get_reduced_length(input_length, self.model.time_reduction_factor),
-                blank=self.text_featurizer.blank
-            )
-            train_loss = tf.nn.compute_average_loss(per_train_loss,
-                                                    global_batch_size=self.global_batch_size)
-
-        gradients = tape.gradient(train_loss, self.model.trainable_variables)
-        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
-
-        self.train_metrics["transducer_loss"].update_state(per_train_loss)
-
-    @tf.function(experimental_relax_shapes=True)
-    def _eval_step(self, batch):
-        _, features, input_length, labels, label_length, prediction, prediction_length = batch
-
-        logits = self.model([features, input_length, prediction, prediction_length], training=False)
-        eval_loss = rnnt_loss(
-            logits=logits, labels=labels, label_length=label_length,
-            logit_length=get_reduced_length(input_length, self.model.time_reduction_factor),
-            blank=self.text_featurizer.blank
-        )
-
-        self.eval_metrics["transducer_loss"].update_state(eval_loss)
-
-    def compile(self,
-                model: Transducer,
-                optimizer: any,
-                max_to_keep: int = 10):
-        with self.strategy.scope():
-            self.model = model
-            self.optimizer = tf.keras.optimizers.get(optimizer)
-        self.create_checkpoint_manager(max_to_keep, model=self.model, optimizer=self.optimizer)
-
-
-class TransducerTrainerGA(TransducerTrainer):
-    """ Transducer Trainer that uses Gradients Accumulation """
-
-    @tf.function
-    def _train_function(self, iterator):
-        for _ in range(self.config.accumulation_steps):
-            batch = next(iterator)
-            self.strategy.run(self._train_step, args=(batch,))
-        self.strategy.run(self._apply_gradients, args=())
-
-    @tf.function
-    def _apply_gradients(self):
-        self.optimizer.apply_gradients(
-            zip(self.accumulation.gradients, self.model.trainable_variables))
-        self.accumulation.reset()
-
-    @tf.function(experimental_relax_shapes=True)
-    def _train_step(self, batch):
-        _, features, input_length, labels, label_length, prediction, prediction_length = batch
-
-        with tf.GradientTape() as tape:
-            logits = self.model([features, input_length, prediction, prediction_length], training=True)
-            tape.watch(logits)
-            per_train_loss = rnnt_loss(
-                logits=logits, labels=labels, label_length=label_length,
-                logit_length=get_reduced_length(input_length, self.model.time_reduction_factor),
-                blank=self.text_featurizer.blank
-            )
-            train_loss = tf.nn.compute_average_loss(
-                per_train_loss,
-                global_batch_size=self.global_batch_size
-            )
-
-        gradients = tape.gradient(train_loss, self.model.trainable_variables)
-        self.accumulation.accumulate(gradients)
-        self.train_metrics["transducer_loss"].update_state(per_train_loss)
-
-    def compile(self,
-                model: Transducer,
-                optimizer: any,
-                max_to_keep: int = 10):
-        with self.strategy.scope():
-            self.model = model
-            self.optimizer = tf.keras.optimizers.get(optimizer)
-        self.create_checkpoint_manager(max_to_keep, model=self.model, optimizer=self.optimizer)
-        self.accumulation = GradientAccumulation(self.model.trainable_variables)

From 9e39e5b5e601c208442f2103570c8f65b2c062ea Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Wed, 14 Apr 2021 00:13:56 +0700
Subject: [PATCH 04/13] :writing_hand: update models encoders

---
 .../README.md                                 |   0
 .../config.yml                                |   0
 .../test_rnn_transducer.py}                   |   0
 .../test_subword_rnn_transducer.py}           |   0
 .../tflite_rnn_transducer.py}                 |   0
 .../tflite_subword_rnn_transducer.py}         |   0
 .../train_ga_rnn_transducer.py}               |   0
 .../train_ga_subword_rnn_transducer.py}       |   0
 .../train_keras_subword_rnn_transducer.py}    |   0
 .../train_rnn_transducer.py}                  |   0
 .../train_subword_rnn_transducer.py}          |   0
 tensorflow_asr/models/encoders/__init__.py    |   0
 tensorflow_asr/models/encoders/conformer.py   | 363 ++++++++++++++++++
 tensorflow_asr/models/encoders/contextnet.py  | 191 +++++++++
 tensorflow_asr/models/transducer/conformer.py | 349 +----------------
 .../models/transducer/contextnet.py           | 177 +--------
 16 files changed, 557 insertions(+), 523 deletions(-)
 rename examples/{streaming_transducer => rnn_transducer}/README.md (100%)
 mode change 100755 => 100644
 rename examples/{streaming_transducer => rnn_transducer}/config.yml (100%)
 mode change 100755 => 100644
 rename examples/{streaming_transducer/test_streaming_transducer.py => rnn_transducer/test_rnn_transducer.py} (100%)
 mode change 100755 => 100644
 rename examples/{streaming_transducer/test_subword_streaming_transducer.py => rnn_transducer/test_subword_rnn_transducer.py} (100%)
 mode change 100755 => 100644
 rename examples/{streaming_transducer/tflite_streaming_transducer.py => rnn_transducer/tflite_rnn_transducer.py} (100%)
 rename examples/{streaming_transducer/tflite_subword_streaming_transducer.py => rnn_transducer/tflite_subword_rnn_transducer.py} (100%)
 rename examples/{streaming_transducer/train_ga_streaming_transducer.py => rnn_transducer/train_ga_rnn_transducer.py} (100%)
 rename examples/{streaming_transducer/train_ga_subword_streaming_transducer.py => rnn_transducer/train_ga_subword_rnn_transducer.py} (100%)
 rename examples/{streaming_transducer/train_keras_subword_streaming_transducer.py => rnn_transducer/train_keras_subword_rnn_transducer.py} (100%)
 rename examples/{streaming_transducer/train_streaming_transducer.py => rnn_transducer/train_rnn_transducer.py} (100%)
 rename examples/{streaming_transducer/train_subword_streaming_transducer.py => rnn_transducer/train_subword_rnn_transducer.py} (100%)
 create mode 100644 tensorflow_asr/models/encoders/__init__.py
 create mode 100644 tensorflow_asr/models/encoders/conformer.py
 create mode 100644 tensorflow_asr/models/encoders/contextnet.py

diff --git a/examples/streaming_transducer/README.md b/examples/rnn_transducer/README.md
old mode 100755
new mode 100644
similarity index 100%
rename from examples/streaming_transducer/README.md
rename to examples/rnn_transducer/README.md
diff --git a/examples/streaming_transducer/config.yml b/examples/rnn_transducer/config.yml
old mode 100755
new mode 100644
similarity index 100%
rename from examples/streaming_transducer/config.yml
rename to examples/rnn_transducer/config.yml
diff --git a/examples/streaming_transducer/test_streaming_transducer.py b/examples/rnn_transducer/test_rnn_transducer.py
old mode 100755
new mode 100644
similarity index 100%
rename from examples/streaming_transducer/test_streaming_transducer.py
rename to examples/rnn_transducer/test_rnn_transducer.py
diff --git a/examples/streaming_transducer/test_subword_streaming_transducer.py b/examples/rnn_transducer/test_subword_rnn_transducer.py
old mode 100755
new mode 100644
similarity index 100%
rename from examples/streaming_transducer/test_subword_streaming_transducer.py
rename to examples/rnn_transducer/test_subword_rnn_transducer.py
diff --git a/examples/streaming_transducer/tflite_streaming_transducer.py b/examples/rnn_transducer/tflite_rnn_transducer.py
similarity index 100%
rename from examples/streaming_transducer/tflite_streaming_transducer.py
rename to examples/rnn_transducer/tflite_rnn_transducer.py
diff --git a/examples/streaming_transducer/tflite_subword_streaming_transducer.py b/examples/rnn_transducer/tflite_subword_rnn_transducer.py
similarity index 100%
rename from examples/streaming_transducer/tflite_subword_streaming_transducer.py
rename to examples/rnn_transducer/tflite_subword_rnn_transducer.py
diff --git a/examples/streaming_transducer/train_ga_streaming_transducer.py b/examples/rnn_transducer/train_ga_rnn_transducer.py
similarity index 100%
rename from examples/streaming_transducer/train_ga_streaming_transducer.py
rename to examples/rnn_transducer/train_ga_rnn_transducer.py
diff --git a/examples/streaming_transducer/train_ga_subword_streaming_transducer.py b/examples/rnn_transducer/train_ga_subword_rnn_transducer.py
similarity index 100%
rename from examples/streaming_transducer/train_ga_subword_streaming_transducer.py
rename to examples/rnn_transducer/train_ga_subword_rnn_transducer.py
diff --git a/examples/streaming_transducer/train_keras_subword_streaming_transducer.py b/examples/rnn_transducer/train_keras_subword_rnn_transducer.py
similarity index 100%
rename from examples/streaming_transducer/train_keras_subword_streaming_transducer.py
rename to examples/rnn_transducer/train_keras_subword_rnn_transducer.py
diff --git a/examples/streaming_transducer/train_streaming_transducer.py b/examples/rnn_transducer/train_rnn_transducer.py
similarity index 100%
rename from examples/streaming_transducer/train_streaming_transducer.py
rename to examples/rnn_transducer/train_rnn_transducer.py
diff --git a/examples/streaming_transducer/train_subword_streaming_transducer.py b/examples/rnn_transducer/train_subword_rnn_transducer.py
similarity index 100%
rename from examples/streaming_transducer/train_subword_streaming_transducer.py
rename to examples/rnn_transducer/train_subword_rnn_transducer.py
diff --git a/tensorflow_asr/models/encoders/__init__.py b/tensorflow_asr/models/encoders/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow_asr/models/encoders/conformer.py b/tensorflow_asr/models/encoders/conformer.py
new file mode 100644
index 0000000000..de7b767fdd
--- /dev/null
+++ b/tensorflow_asr/models/encoders/conformer.py
@@ -0,0 +1,363 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+from ..activations.glu import GLU
+from ..layers.subsampling import VggSubsampling, Conv2dSubsampling
+from ..layers.positional_encoding import PositionalEncoding, PositionalEncodingConcat
+from ..layers.multihead_attention import MultiHeadAttention, RelPositionMultiHeadAttention
+from ...utils import shape_util
+
+L2 = tf.keras.regularizers.l2(1e-6)
+
+
+class FFModule(tf.keras.layers.Layer):  # LayerNorm -> Dense(4 * input_dim) -> swish -> dropout -> Dense(input_dim) -> dropout -> scaled residual add
+    def __init__(self,
+                 input_dim,  # units of the second Dense layer; the first Dense layer uses 4 * input_dim
+                 dropout=0.0,  # rate shared by both Dropout layers
+                 fc_factor=0.5,  # multiplier applied to the branch output before the residual add
+                 kernel_regularizer=L2,  # used for the Dense kernels and the LayerNormalization gamma
+                 bias_regularizer=L2,  # used for the Dense biases and the LayerNormalization beta
+                 name="ff_module",  # also the prefix of every sub-layer name
+                 **kwargs):
+        super(FFModule, self).__init__(name=name, **kwargs)
+        self.fc_factor = fc_factor
+        self.ln = tf.keras.layers.LayerNormalization(
+            name=f"{name}_ln",
+            gamma_regularizer=kernel_regularizer,
+            beta_regularizer=bias_regularizer
+        )
+        self.ffn1 = tf.keras.layers.Dense(
+            4 * input_dim, name=f"{name}_dense_1",
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer
+        )
+        self.swish = tf.keras.layers.Activation(tf.nn.swish, name=f"{name}_swish_activation")
+        self.do1 = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout_1")
+        self.ffn2 = tf.keras.layers.Dense(
+            input_dim, name=f"{name}_dense_2",
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer
+        )
+        self.do2 = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout_2")
+        self.res_add = tf.keras.layers.Add(name=f"{name}_add")
+
+    def call(self, inputs, training=False, **kwargs):  # **kwargs is accepted but not used
+        outputs = self.ln(inputs, training=training)
+        outputs = self.ffn1(outputs, training=training)
+        outputs = self.swish(outputs)
+        outputs = self.do1(outputs, training=training)
+        outputs = self.ffn2(outputs, training=training)
+        outputs = self.do2(outputs, training=training)
+        outputs = self.res_add([inputs, self.fc_factor * outputs])  # residual: inputs + fc_factor * branch output
+        return outputs
+
+    def get_config(self):  # flat dict: base Layer config, fc_factor, then every sub-layer config merged in
+        conf = super(FFModule, self).get_config()
+        conf.update({"fc_factor": self.fc_factor})
+        conf.update(self.ln.get_config())  # NOTE(review): successive update() calls keep only the last value for any key the sub-layer configs share — confirm this flat merge is intended
+        conf.update(self.ffn1.get_config())
+        conf.update(self.swish.get_config())
+        conf.update(self.do1.get_config())
+        conf.update(self.ffn2.get_config())
+        conf.update(self.do2.get_config())
+        conf.update(self.res_add.get_config())
+        return conf
+
+
+class MHSAModule(tf.keras.layers.Layer):  # LayerNorm -> multi-head self-attention -> dropout -> residual add
+    def __init__(self,
+                 head_size,  # forwarded unchanged to the attention layer
+                 num_heads,  # forwarded unchanged to the attention layer
+                 dropout=0.0,  # rate of the Dropout applied to the attention output
+                 mha_type="relmha",  # "relmha" -> RelPositionMultiHeadAttention, "mha" -> MultiHeadAttention
+                 kernel_regularizer=L2,
+                 bias_regularizer=L2,
+                 name="mhsa_module",  # also the prefix of every sub-layer name
+                 **kwargs):
+        super(MHSAModule, self).__init__(name=name, **kwargs)
+        self.ln = tf.keras.layers.LayerNormalization(
+            name=f"{name}_ln",
+            gamma_regularizer=kernel_regularizer,
+            beta_regularizer=bias_regularizer
+        )
+        if mha_type == "relmha":
+            self.mha = RelPositionMultiHeadAttention(
+                name=f"{name}_mhsa",
+                head_size=head_size, num_heads=num_heads,
+                kernel_regularizer=kernel_regularizer,
+                bias_regularizer=bias_regularizer
+            )
+        elif mha_type == "mha":
+            self.mha = MultiHeadAttention(
+                name=f"{name}_mhsa",
+                head_size=head_size, num_heads=num_heads,
+                kernel_regularizer=kernel_regularizer,
+                bias_regularizer=bias_regularizer
+            )
+        else:
+            raise ValueError("mha_type must be either 'mha' or 'relmha'")
+        self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")
+        self.res_add = tf.keras.layers.Add(name=f"{name}_add")
+        self.mha_type = mha_type  # kept because call() branches on it
+
+    def call(self, inputs, training=False, mask=None, **kwargs):  # inputs is a pair [features, pos]; **kwargs is not used
+        inputs, pos = inputs  # pos is positional encoding
+        outputs = self.ln(inputs, training=training)
+        if self.mha_type == "relmha":
+            outputs = self.mha([outputs, outputs, outputs, pos], training=training, mask=mask)  # pos is handed to the attention layer as a fourth input
+        else:
+            outputs = outputs + pos  # plain attention: pos is added to the normalized features instead
+            outputs = self.mha([outputs, outputs, outputs], training=training, mask=mask)
+        outputs = self.do(outputs, training=training)
+        outputs = self.res_add([inputs, outputs])  # residual uses the un-normalized features
+        return outputs
+
+    def get_config(self):  # flat dict: base Layer config, mha_type, then every sub-layer config merged in
+        conf = super(MHSAModule, self).get_config()
+        conf.update({"mha_type": self.mha_type})
+        conf.update(self.ln.get_config())  # NOTE(review): successive update() calls keep only the last value for any key the sub-layer configs share — confirm this flat merge is intended
+        conf.update(self.mha.get_config())
+        conf.update(self.do.get_config())
+        conf.update(self.res_add.get_config())
+        return conf
+
+
+class ConvModule(tf.keras.layers.Layer):  # LayerNorm -> pointwise conv -> GLU -> depthwise conv -> BatchNorm -> swish -> pointwise conv -> dropout -> residual add
+    def __init__(self,
+                 input_dim,  # filters of the second pointwise conv; the first pointwise conv uses 2 * input_dim
+                 kernel_size=32,  # extent of the depthwise kernel along its first spatial axis
+                 dropout=0.0,
+                 depth_multiplier=1,  # forwarded to DepthwiseConv2D
+                 kernel_regularizer=L2,
+                 bias_regularizer=L2,
+                 name="conv_module",  # also the prefix of the named sub-layers
+                 **kwargs):
+        super(ConvModule, self).__init__(name=name, **kwargs)
+        self.ln = tf.keras.layers.LayerNormalization()  # NOTE(review): the only sub-layer here created without a name or regularizers — confirm that is intended
+        self.pw_conv_1 = tf.keras.layers.Conv2D(
+            filters=2 * input_dim, kernel_size=1, strides=1,
+            padding="valid", name=f"{name}_pw_conv_1",
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer
+        )
+        self.glu = GLU(name=f"{name}_glu")
+        self.dw_conv = tf.keras.layers.DepthwiseConv2D(
+            kernel_size=(kernel_size, 1), strides=1,
+            padding="same", name=f"{name}_dw_conv",
+            depth_multiplier=depth_multiplier,
+            depthwise_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer
+        )
+        self.bn = tf.keras.layers.BatchNormalization(
+            name=f"{name}_bn",
+            gamma_regularizer=kernel_regularizer,
+            beta_regularizer=bias_regularizer
+        )
+        self.swish = tf.keras.layers.Activation(tf.nn.swish, name=f"{name}_swish_activation")
+        self.pw_conv_2 = tf.keras.layers.Conv2D(
+            filters=input_dim, kernel_size=1, strides=1,
+            padding="valid", name=f"{name}_pw_conv_2",
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer
+        )
+        self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")
+        self.res_add = tf.keras.layers.Add(name=f"{name}_add")
+
+    def call(self, inputs, training=False, **kwargs):  # **kwargs is accepted but not used
+        outputs = self.ln(inputs, training=training)
+        B, T, E = shape_util.shape_list(outputs)  # unpacks exactly three dims, so the features must be rank 3
+        outputs = tf.reshape(outputs, [B, T, 1, E])  # insert a singleton axis so the 2D conv layers can be applied
+        outputs = self.pw_conv_1(outputs, training=training)
+        outputs = self.glu(outputs)
+        outputs = self.dw_conv(outputs, training=training)
+        outputs = self.bn(outputs, training=training)
+        outputs = self.swish(outputs)
+        outputs = self.pw_conv_2(outputs, training=training)
+        outputs = tf.reshape(outputs, [B, T, E])  # drop the singleton axis again; reuses the E read before the convs
+        outputs = self.do(outputs, training=training)
+        outputs = self.res_add([inputs, outputs])  # residual uses the un-normalized features
+        return outputs
+
+    def get_config(self):  # flat dict: base Layer config, then every sub-layer config merged in
+        conf = super(ConvModule, self).get_config()
+        conf.update(self.ln.get_config())  # NOTE(review): successive update() calls keep only the last value for any key the sub-layer configs share — confirm this flat merge is intended
+        conf.update(self.pw_conv_1.get_config())
+        conf.update(self.glu.get_config())
+        conf.update(self.dw_conv.get_config())
+        conf.update(self.bn.get_config())
+        conf.update(self.swish.get_config())
+        conf.update(self.pw_conv_2.get_config())
+        conf.update(self.do.get_config())
+        conf.update(self.res_add.get_config())
+        return conf
+
+
+class ConformerBlock(tf.keras.layers.Layer):  # FFModule -> MHSAModule -> ConvModule -> FFModule -> LayerNormalization
+    def __init__(self,
+                 input_dim,  # forwarded to both FFModules and to the ConvModule
+                 dropout=0.0,  # forwarded to every sub-module
+                 fc_factor=0.5,  # residual scale of both FFModules
+                 head_size=36,  # forwarded to the MHSAModule
+                 num_heads=4,  # forwarded to the MHSAModule
+                 mha_type="relmha",  # "relmha" or "mha"; validated inside MHSAModule
+                 kernel_size=32,  # forwarded to the ConvModule
+                 depth_multiplier=1,  # forwarded to the ConvModule
+                 kernel_regularizer=L2,
+                 bias_regularizer=L2,
+                 name="conformer_block",  # prefix of every sub-module name
+                 **kwargs):
+        super(ConformerBlock, self).__init__(name=name, **kwargs)
+        self.ffm1 = FFModule(
+            input_dim=input_dim, dropout=dropout,
+            fc_factor=fc_factor, name=f"{name}_ff_module_1",
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer
+        )
+        self.mhsam = MHSAModule(
+            mha_type=mha_type,
+            head_size=head_size, num_heads=num_heads,
+            dropout=dropout, name=f"{name}_mhsa_module",
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer
+        )
+        self.convm = ConvModule(
+            input_dim=input_dim, kernel_size=kernel_size,
+            dropout=dropout, name=f"{name}_conv_module",
+            depth_multiplier=depth_multiplier,
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer
+        )
+        self.ffm2 = FFModule(
+            input_dim=input_dim, dropout=dropout,
+            fc_factor=fc_factor, name=f"{name}_ff_module_2",
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer
+        )
+        self.ln = tf.keras.layers.LayerNormalization(
+            name=f"{name}_ln",
+            gamma_regularizer=kernel_regularizer,
+            beta_regularizer=bias_regularizer  # beta is the offset term, so it takes bias_regularizer like the LayerNorms in FFModule and MHSAModule
+        )
+
+    def call(self, inputs, training=False, mask=None, **kwargs):  # inputs is a pair [features, pos]; **kwargs is forwarded to every sub-module
+        inputs, pos = inputs  # pos is positional encoding
+        outputs = self.ffm1(inputs, training=training, **kwargs)
+        outputs = self.mhsam([outputs, pos], training=training, mask=mask, **kwargs)  # only the attention module receives pos and mask
+        outputs = self.convm(outputs, training=training, **kwargs)
+        outputs = self.ffm2(outputs, training=training, **kwargs)
+        outputs = self.ln(outputs, training=training)
+        return outputs
+
+    def get_config(self):  # flat dict: base Layer config, then every sub-module config merged in
+        conf = super(ConformerBlock, self).get_config()
+        conf.update(self.ffm1.get_config())  # NOTE(review): successive update() calls keep only the last value for any key the sub-module configs share — confirm this flat merge is intended
+        conf.update(self.mhsam.get_config())
+        conf.update(self.convm.get_config())
+        conf.update(self.ffm2.get_config())
+        conf.update(self.ln.get_config())
+        return conf
+
+
+class ConformerEncoder(tf.keras.Model):  # subsampling -> Dense(dmodel) -> dropout -> num_blocks ConformerBlocks sharing one positional encoding
+    def __init__(self,
+                 subsampling,  # mapping of subsampling-layer kwargs plus an optional "type" key ("conv2d" by default, or "vgg")
+                 positional_encoding="sinusoid",  # one of the names handled in the if/elif chain below
+                 dmodel=144,  # units of the linear projection and input_dim of every block
+                 num_blocks=16,
+                 mha_type="relmha",
+                 head_size=36,
+                 num_heads=4,
+                 kernel_size=32,
+                 depth_multiplier=1,
+                 fc_factor=0.5,
+                 dropout=0.0,
+                 kernel_regularizer=L2,
+                 bias_regularizer=L2,
+                 name="conformer_encoder",  # prefix of every sub-layer name
+                 **kwargs):
+        super(ConformerEncoder, self).__init__(name=name, **kwargs)
+        subsampling = dict(subsampling)  # work on a copy so pop() below does not strip "type" from the caller's config
+        subsampling_name = subsampling.pop("type", "conv2d")
+        if subsampling_name == "vgg":
+            subsampling_class = VggSubsampling
+        elif subsampling_name == "conv2d":
+            subsampling_class = Conv2dSubsampling
+        else:
+            raise ValueError("subsampling must be either  'conv2d' or 'vgg'")
+
+        self.conv_subsampling = subsampling_class(
+            **subsampling, name=f"{name}_subsampling",
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer
+        )
+
+        if positional_encoding == "sinusoid":
+            self.pe = PositionalEncoding(name=f"{name}_pe")
+        elif positional_encoding == "sinusoid_v2":
+            self.pe = PositionalEncoding(alpha=2, beta=0, name=f"{name}_pe")
+        elif positional_encoding == "sinusoid_concat":
+            self.pe = PositionalEncodingConcat(name=f"{name}_pe")
+        elif positional_encoding == "sinusoid_concat_v2":
+            self.pe = PositionalEncodingConcat(alpha=2, beta=-1, name=f"{name}_pe")
+        elif positional_encoding == "subsampling":
+            self.pe = tf.keras.layers.Activation("linear", name=f"{name}_pe")  # identity layer: the projected features themselves are used as "pos"
+        else:
+            raise ValueError("positional_encoding must be either 'sinusoid', "  # adjacent literals keep source indentation out of the message
+                             "'sinusoid_concat', 'sinusoid_v2', 'sinusoid_concat_v2' or 'subsampling'")
+
+        self.linear = tf.keras.layers.Dense(
+            dmodel, name=f"{name}_linear",
+            kernel_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer
+        )
+        self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")
+
+        self.conformer_blocks = []
+        for i in range(num_blocks):
+            conformer_block = ConformerBlock(
+                input_dim=dmodel,
+                dropout=dropout,
+                fc_factor=fc_factor,
+                head_size=head_size,
+                num_heads=num_heads,
+                mha_type=mha_type,
+                kernel_size=kernel_size,
+                depth_multiplier=depth_multiplier,
+                kernel_regularizer=kernel_regularizer,
+                bias_regularizer=bias_regularizer,
+                name=f"{name}_block_{i}"
+            )
+            self.conformer_blocks.append(conformer_block)
+
+    def call(self, inputs, training=False, mask=None, **kwargs):  # mask and **kwargs are forwarded to every ConformerBlock
+        # input with shape [B, T, V1, V2]
+        outputs = self.conv_subsampling(inputs, training=training)
+        outputs = self.linear(outputs, training=training)
+        pe = self.pe(outputs)  # computed once from the projected features, before dropout, and shared by every block
+        outputs = self.do(outputs, training=training)
+        for cblock in self.conformer_blocks:
+            outputs = cblock([outputs, pe], training=training, mask=mask, **kwargs)
+        return outputs
+
+    def get_config(self):  # flat dict: base Model config, then every sub-layer and block config merged in
+        conf = super(ConformerEncoder, self).get_config()
+        conf.update(self.conv_subsampling.get_config())  # NOTE(review): successive update() calls keep only the last value for any key the merged configs share — confirm this flat merge is intended
+        conf.update(self.linear.get_config())
+        conf.update(self.do.get_config())
+        conf.update(self.pe.get_config())
+        for cblock in self.conformer_blocks:
+            conf.update(cblock.get_config())
+        return conf
diff --git a/tensorflow_asr/models/encoders/contextnet.py b/tensorflow_asr/models/encoders/contextnet.py
new file mode 100644
index 0000000000..5fd9924972
--- /dev/null
+++ b/tensorflow_asr/models/encoders/contextnet.py
@@ -0,0 +1,191 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Ref: https://github.com/iankur/ContextNet """
+
+from typing import List
+import tensorflow as tf
+from ...utils import math_util
+
+L2 = tf.keras.regularizers.l2(1e-6)
+
+
+def get_activation(activation: str = "silu"):
+    # Case-insensitive lookup of the TensorFlow callable registered under `activation`; unknown names raise ValueError
+    fn = {"silu": tf.nn.swish, "swish": tf.nn.swish, "relu": tf.nn.relu,
+          "linear": tf.keras.activations.linear}.get(activation.lower())
+    if fn is None: raise ValueError("activation must be either 'silu', 'swish', 'relu' or 'linear'")
+    return fn
+
+
+class Reshape(tf.keras.layers.Layer):  # Layer wrapper that applies math_util.merge_two_last_dims to its input
+    def call(self, inputs): return math_util.merge_two_last_dims(inputs)  # presumably collapses the two trailing axes into one — confirm in math_util
+
+
+class ConvModule(tf.keras.layers.Layer):  # SeparableConv1D ("same" padding) -> BatchNormalization -> activation
+    def __init__(self,
+                 kernel_size: int = 3,
+                 strides: int = 1,
+                 filters: int = 256,
+                 activation: str = "silu",  # name resolved through get_activation
+                 kernel_regularizer = None,  # applied to both the depthwise and the pointwise kernel
+                 bias_regularizer = None,
+                 **kwargs):
+        super(ConvModule, self).__init__(**kwargs)
+        self.strides = strides  # kept as an attribute; ConvBlock.call reads last_conv.strides
+        self.conv = tf.keras.layers.SeparableConv1D(
+            filters=filters, kernel_size=kernel_size, strides=strides, padding="same",
+            depthwise_regularizer=kernel_regularizer, pointwise_regularizer=kernel_regularizer,
+            bias_regularizer=bias_regularizer, name=f"{self.name}_conv"
+        )
+        self.bn = tf.keras.layers.BatchNormalization(name=f"{self.name}_bn")  # created without regularizers
+        self.activation = get_activation(activation)
+
+    def call(self, inputs, training=False, **kwargs):  # **kwargs is accepted but not used
+        outputs = self.conv(inputs, training=training)
+        outputs = self.bn(outputs, training=training)
+        outputs = self.activation(outputs)
+        return outputs
+
+
+class SEModule(tf.keras.layers.Layer):  # ConvModule whose output is rescaled by a sigmoid gate computed from its length-normalized sum over axis 1
+    def __init__(self,
+                 kernel_size: int = 3,
+                 strides: int = 1,
+                 filters: int = 256,  # width of the ConvModule and of fc2; fc1 uses filters // 8
+                 activation: str = "silu",  # name resolved through get_activation; also forwarded to the ConvModule
+                 kernel_regularizer = None,
+                 bias_regularizer = None,
+                 **kwargs):
+        super(SEModule, self).__init__(**kwargs)
+        self.conv = ConvModule(
+            kernel_size=kernel_size, strides=strides,
+            filters=filters, activation=activation,
+            kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
+            name=f"{self.name}_conv_module"
+        )
+        self.activation = get_activation(activation)
+        self.fc1 = tf.keras.layers.Dense(filters // 8, name=f"{self.name}_fc1")  # NOTE(review): the regularizers go to the ConvModule only, not to fc1 / fc2 — confirm
+        self.fc2 = tf.keras.layers.Dense(filters, name=f"{self.name}_fc2")
+
+    def call(self, inputs, training=False, **kwargs):  # inputs is a pair [features, input_length]; **kwargs is not used
+        features, input_length = inputs
+        outputs = self.conv(features, training=training)
+
+        se = tf.divide(tf.reduce_sum(outputs, axis=1), tf.expand_dims(tf.cast(input_length, dtype=outputs.dtype), axis=1))  # sum over all of axis 1 divided by input_length; NOTE(review): equals a mean over valid steps only if steps beyond input_length contribute zero — confirm
+        se = self.fc1(se, training=training)
+        se = self.activation(se)
+        se = self.fc2(se, training=training)
+        se = self.activation(se)  # the activation is applied again after fc2, before the sigmoid
+        se = tf.nn.sigmoid(se)
+        se = tf.expand_dims(se, axis=1)  # restore axis 1 so the gate broadcasts over it
+
+        outputs = tf.multiply(outputs, se)
+        return outputs
+
+
+class ConvBlock(tf.keras.layers.Layer):  # (nlayers - 1) stride-1 ConvModules -> one ConvModule with `strides` -> SEModule -> optional residual add -> activation
+    def __init__(self,
+                 nlayers: int = 3,  # total number of ConvModules before the SEModule, counting last_conv
+                 kernel_size: int = 3,
+                 filters: int = 256,  # scaled by alpha before any sub-module is built
+                 strides: int = 1,  # applied by last_conv and by the residual ConvModule only
+                 residual: bool = True,  # when False no residual ConvModule is created
+                 activation: str = 'silu',  # name resolved through get_activation
+                 alpha: float = 1.0,
+                 kernel_regularizer = None,
+                 bias_regularizer = None,
+                 **kwargs):
+        super(ConvBlock, self).__init__(**kwargs)
+
+        self.dmodel = filters  # recorded before the alpha scaling on the line after next
+        self.time_reduction_factor = strides
+        filters = int(filters * alpha)  # width actually used by every sub-module
+
+        self.convs = []
+        for i in range(nlayers - 1):
+            self.convs.append(
+                ConvModule(
+                    kernel_size=kernel_size, strides=1,
+                    filters=filters, activation=activation,
+                    kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
+                    name=f"{self.name}_conv_module_{i}"
+                )
+            )
+
+        self.last_conv = ConvModule(
+            kernel_size=kernel_size, strides=strides,
+            filters=filters, activation=activation,
+            kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
+            name=f"{self.name}_conv_module_{nlayers - 1}"
+        )
+
+        self.se = SEModule(
+            kernel_size=kernel_size, strides=1, filters=filters, activation=activation,
+            kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
+            name=f"{self.name}_se"
+        )
+
+        self.residual = None
+        if residual:
+            self.residual = ConvModule(
+                kernel_size=kernel_size, strides=strides,
+                filters=filters, activation="linear",
+                kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
+                name=f"{self.name}_residual"
+            )
+
+        self.activation = get_activation(activation)
+
+    def call(self, inputs, training=False, **kwargs):  # inputs is a pair [features, input_length]; returns (outputs, reduced input_length)
+        features, input_length = inputs
+        outputs = features
+        for conv in self.convs:
+            outputs = conv(outputs, training=training)
+        outputs = self.last_conv(outputs, training=training)
+        input_length = math_util.get_reduced_length(input_length, self.last_conv.strides)  # lengths recomputed for last_conv's strides before they reach the SEModule
+        outputs = self.se([outputs, input_length], training=training)
+        if self.residual is not None:
+            res = self.residual(features, training=training)  # residual branch starts from the block input and uses the same strides as last_conv
+            outputs = tf.add(outputs, res)
+        outputs = self.activation(outputs)
+        return outputs, input_length
+
+
+class ContextNetEncoder(tf.keras.Model):  # Reshape followed by a stack of ConvBlocks built from `blocks`
+    def __init__(self,
+                 blocks: List[dict] = None,  # one kwargs dict per ConvBlock; None (the default) builds no blocks
+                 alpha: float = 1.0,  # forwarded to every ConvBlock
+                 kernel_regularizer = None,  # forwarded to every ConvBlock
+                 bias_regularizer = None,  # forwarded to every ConvBlock
+                 **kwargs):
+        super(ContextNetEncoder, self).__init__(**kwargs)
+
+        self.reshape = Reshape(name=f"{self.name}_reshape")
+
+        self.blocks = []
+        for i, config in enumerate(blocks or []):  # `or []` stands in for the former shared mutable `[]` default
+            self.blocks.append(
+                ConvBlock(
+                    **config, alpha=alpha,
+                    kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
+                    name=f"{self.name}_block_{i}"
+                )
+            )
+
+    def call(self, inputs, training=False, **kwargs):  # inputs is a pair (features, input_length); **kwargs is not used
+        outputs, input_length = inputs
+        outputs = self.reshape(outputs)
+        for block in self.blocks:
+            outputs, input_length = block([outputs, input_length], training=training)  # each block returns its own reduced lengths for the next one
+        return outputs  # only the features are returned; the final input_length is discarded
diff --git a/tensorflow_asr/models/transducer/conformer.py b/tensorflow_asr/models/transducer/conformer.py
index f66197d972..b5d151e266 100644
--- a/tensorflow_asr/models/transducer/conformer.py
+++ b/tensorflow_asr/models/transducer/conformer.py
@@ -12,356 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tensorflow as tf
 
-from ..activations.glu import GLU
+from ..encoders.conformer import ConformerEncoder, L2
 from .transducer import Transducer
-from ..layers.subsampling import VggSubsampling, Conv2dSubsampling
-from ..layers.positional_encoding import PositionalEncoding, PositionalEncodingConcat
-from ..layers.multihead_attention import MultiHeadAttention, RelPositionMultiHeadAttention
-from ...utils import shape_util
-
-L2 = tf.keras.regularizers.l2(1e-6)
-
-
-class FFModule(tf.keras.layers.Layer):
-    def __init__(self,
-                 input_dim,
-                 dropout=0.0,
-                 fc_factor=0.5,
-                 kernel_regularizer=L2,
-                 bias_regularizer=L2,
-                 name="ff_module",
-                 **kwargs):
-        super(FFModule, self).__init__(name=name, **kwargs)
-        self.fc_factor = fc_factor
-        self.ln = tf.keras.layers.LayerNormalization(
-            name=f"{name}_ln",
-            gamma_regularizer=kernel_regularizer,
-            beta_regularizer=bias_regularizer
-        )
-        self.ffn1 = tf.keras.layers.Dense(
-            4 * input_dim, name=f"{name}_dense_1",
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer
-        )
-        self.swish = tf.keras.layers.Activation(tf.nn.swish, name=f"{name}_swish_activation")
-        self.do1 = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout_1")
-        self.ffn2 = tf.keras.layers.Dense(
-            input_dim, name=f"{name}_dense_2",
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer
-        )
-        self.do2 = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout_2")
-        self.res_add = tf.keras.layers.Add(name=f"{name}_add")
-
-    def call(self, inputs, training=False, **kwargs):
-        outputs = self.ln(inputs, training=training)
-        outputs = self.ffn1(outputs, training=training)
-        outputs = self.swish(outputs)
-        outputs = self.do1(outputs, training=training)
-        outputs = self.ffn2(outputs, training=training)
-        outputs = self.do2(outputs, training=training)
-        outputs = self.res_add([inputs, self.fc_factor * outputs])
-        return outputs
-
-    def get_config(self):
-        conf = super(FFModule, self).get_config()
-        conf.update({"fc_factor": self.fc_factor})
-        conf.update(self.ln.get_config())
-        conf.update(self.ffn1.get_config())
-        conf.update(self.swish.get_config())
-        conf.update(self.do1.get_config())
-        conf.update(self.ffn2.get_config())
-        conf.update(self.do2.get_config())
-        conf.update(self.res_add.get_config())
-        return conf
-
-
-class MHSAModule(tf.keras.layers.Layer):
-    def __init__(self,
-                 head_size,
-                 num_heads,
-                 dropout=0.0,
-                 mha_type="relmha",
-                 kernel_regularizer=L2,
-                 bias_regularizer=L2,
-                 name="mhsa_module",
-                 **kwargs):
-        super(MHSAModule, self).__init__(name=name, **kwargs)
-        self.ln = tf.keras.layers.LayerNormalization(
-            name=f"{name}_ln",
-            gamma_regularizer=kernel_regularizer,
-            beta_regularizer=bias_regularizer
-        )
-        if mha_type == "relmha":
-            self.mha = RelPositionMultiHeadAttention(
-                name=f"{name}_mhsa",
-                head_size=head_size, num_heads=num_heads,
-                kernel_regularizer=kernel_regularizer,
-                bias_regularizer=bias_regularizer
-            )
-        elif mha_type == "mha":
-            self.mha = MultiHeadAttention(
-                name=f"{name}_mhsa",
-                head_size=head_size, num_heads=num_heads,
-                kernel_regularizer=kernel_regularizer,
-                bias_regularizer=bias_regularizer
-            )
-        else:
-            raise ValueError("mha_type must be either 'mha' or 'relmha'")
-        self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")
-        self.res_add = tf.keras.layers.Add(name=f"{name}_add")
-        self.mha_type = mha_type
-
-    def call(self, inputs, training=False, mask=None, **kwargs):
-        inputs, pos = inputs  # pos is positional encoding
-        outputs = self.ln(inputs, training=training)
-        if self.mha_type == "relmha":
-            outputs = self.mha([outputs, outputs, outputs, pos], training=training, mask=mask)
-        else:
-            outputs = outputs + pos
-            outputs = self.mha([outputs, outputs, outputs], training=training, mask=mask)
-        outputs = self.do(outputs, training=training)
-        outputs = self.res_add([inputs, outputs])
-        return outputs
-
-    def get_config(self):
-        conf = super(MHSAModule, self).get_config()
-        conf.update({"mha_type": self.mha_type})
-        conf.update(self.ln.get_config())
-        conf.update(self.mha.get_config())
-        conf.update(self.do.get_config())
-        conf.update(self.res_add.get_config())
-        return conf
-
-
-class ConvModule(tf.keras.layers.Layer):
-    def __init__(self,
-                 input_dim,
-                 kernel_size=32,
-                 dropout=0.0,
-                 depth_multiplier=1,
-                 kernel_regularizer=L2,
-                 bias_regularizer=L2,
-                 name="conv_module",
-                 **kwargs):
-        super(ConvModule, self).__init__(name=name, **kwargs)
-        self.ln = tf.keras.layers.LayerNormalization()
-        self.pw_conv_1 = tf.keras.layers.Conv2D(
-            filters=2 * input_dim, kernel_size=1, strides=1,
-            padding="valid", name=f"{name}_pw_conv_1",
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer
-        )
-        self.glu = GLU(name=f"{name}_glu")
-        self.dw_conv = tf.keras.layers.DepthwiseConv2D(
-            kernel_size=(kernel_size, 1), strides=1,
-            padding="same", name=f"{name}_dw_conv",
-            depth_multiplier=depth_multiplier,
-            depthwise_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer
-        )
-        self.bn = tf.keras.layers.BatchNormalization(
-            name=f"{name}_bn",
-            gamma_regularizer=kernel_regularizer,
-            beta_regularizer=bias_regularizer
-        )
-        self.swish = tf.keras.layers.Activation(tf.nn.swish, name=f"{name}_swish_activation")
-        self.pw_conv_2 = tf.keras.layers.Conv2D(
-            filters=input_dim, kernel_size=1, strides=1,
-            padding="valid", name=f"{name}_pw_conv_2",
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer
-        )
-        self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")
-        self.res_add = tf.keras.layers.Add(name=f"{name}_add")
-
-    def call(self, inputs, training=False, **kwargs):
-        outputs = self.ln(inputs, training=training)
-        B, T, E = shape_util.shape_list(outputs)
-        outputs = tf.reshape(outputs, [B, T, 1, E])
-        outputs = self.pw_conv_1(outputs, training=training)
-        outputs = self.glu(outputs)
-        outputs = self.dw_conv(outputs, training=training)
-        outputs = self.bn(outputs, training=training)
-        outputs = self.swish(outputs)
-        outputs = self.pw_conv_2(outputs, training=training)
-        outputs = tf.reshape(outputs, [B, T, E])
-        outputs = self.do(outputs, training=training)
-        outputs = self.res_add([inputs, outputs])
-        return outputs
-
-    def get_config(self):
-        conf = super(ConvModule, self).get_config()
-        conf.update(self.ln.get_config())
-        conf.update(self.pw_conv_1.get_config())
-        conf.update(self.glu.get_config())
-        conf.update(self.dw_conv.get_config())
-        conf.update(self.bn.get_config())
-        conf.update(self.swish.get_config())
-        conf.update(self.pw_conv_2.get_config())
-        conf.update(self.do.get_config())
-        conf.update(self.res_add.get_config())
-        return conf
-
-
-class ConformerBlock(tf.keras.layers.Layer):
-    def __init__(self,
-                 input_dim,
-                 dropout=0.0,
-                 fc_factor=0.5,
-                 head_size=36,
-                 num_heads=4,
-                 mha_type="relmha",
-                 kernel_size=32,
-                 depth_multiplier=1,
-                 kernel_regularizer=L2,
-                 bias_regularizer=L2,
-                 name="conformer_block",
-                 **kwargs):
-        super(ConformerBlock, self).__init__(name=name, **kwargs)
-        self.ffm1 = FFModule(
-            input_dim=input_dim, dropout=dropout,
-            fc_factor=fc_factor, name=f"{name}_ff_module_1",
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer
-        )
-        self.mhsam = MHSAModule(
-            mha_type=mha_type,
-            head_size=head_size, num_heads=num_heads,
-            dropout=dropout, name=f"{name}_mhsa_module",
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer
-        )
-        self.convm = ConvModule(
-            input_dim=input_dim, kernel_size=kernel_size,
-            dropout=dropout, name=f"{name}_conv_module",
-            depth_multiplier=depth_multiplier,
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer
-        )
-        self.ffm2 = FFModule(
-            input_dim=input_dim, dropout=dropout,
-            fc_factor=fc_factor, name=f"{name}_ff_module_2",
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer
-        )
-        self.ln = tf.keras.layers.LayerNormalization(
-            name=f"{name}_ln",
-            gamma_regularizer=kernel_regularizer,
-            beta_regularizer=kernel_regularizer
-        )
-
-    def call(self, inputs, training=False, mask=None, **kwargs):
-        inputs, pos = inputs  # pos is positional encoding
-        outputs = self.ffm1(inputs, training=training, **kwargs)
-        outputs = self.mhsam([outputs, pos], training=training, mask=mask, **kwargs)
-        outputs = self.convm(outputs, training=training, **kwargs)
-        outputs = self.ffm2(outputs, training=training, **kwargs)
-        outputs = self.ln(outputs, training=training)
-        return outputs
-
-    def get_config(self):
-        conf = super(ConformerBlock, self).get_config()
-        conf.update(self.ffm1.get_config())
-        conf.update(self.mhsam.get_config())
-        conf.update(self.convm.get_config())
-        conf.update(self.ffm2.get_config())
-        conf.update(self.ln.get_config())
-        return conf
-
-
-class ConformerEncoder(tf.keras.Model):
-    def __init__(self,
-                 subsampling,
-                 positional_encoding="sinusoid",
-                 dmodel=144,
-                 num_blocks=16,
-                 mha_type="relmha",
-                 head_size=36,
-                 num_heads=4,
-                 kernel_size=32,
-                 depth_multiplier=1,
-                 fc_factor=0.5,
-                 dropout=0.0,
-                 kernel_regularizer=L2,
-                 bias_regularizer=L2,
-                 name="conformer_encoder",
-                 **kwargs):
-        super(ConformerEncoder, self).__init__(name=name, **kwargs)
-
-        subsampling_name = subsampling.pop("type", "conv2d")
-        if subsampling_name == "vgg":
-            subsampling_class = VggSubsampling
-        elif subsampling_name == "conv2d":
-            subsampling_class = Conv2dSubsampling
-        else:
-            raise ValueError("subsampling must be either  'conv2d' or 'vgg'")
-
-        self.conv_subsampling = subsampling_class(
-            **subsampling, name=f"{name}_subsampling",
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer
-        )
-
-        if positional_encoding == "sinusoid":
-            self.pe = PositionalEncoding(name=f"{name}_pe")
-        elif positional_encoding == "sinusoid_v2":
-            self.pe = PositionalEncoding(alpha=2, beta=0, name=f"{name}_pe")
-        elif positional_encoding == "sinusoid_concat":
-            self.pe = PositionalEncodingConcat(name=f"{name}_pe")
-        elif positional_encoding == "sinusoid_concat_v2":
-            self.pe = PositionalEncodingConcat(alpha=2, beta=-1, name=f"{name}_pe")
-        elif positional_encoding == "subsampling":
-            self.pe = tf.keras.layers.Activation("linear", name=f"{name}_pe")
-        else:
-            raise ValueError("positional_encoding must be either 'sinusoid', \
-                'sinusoid_concat', 'sinusoid_v2', 'sinusoid_concat_v2' or 'subsampling'")
-
-        self.linear = tf.keras.layers.Dense(
-            dmodel, name=f"{name}_linear",
-            kernel_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer
-        )
-        self.do = tf.keras.layers.Dropout(dropout, name=f"{name}_dropout")
-
-        self.conformer_blocks = []
-        for i in range(num_blocks):
-            conformer_block = ConformerBlock(
-                input_dim=dmodel,
-                dropout=dropout,
-                fc_factor=fc_factor,
-                head_size=head_size,
-                num_heads=num_heads,
-                mha_type=mha_type,
-                kernel_size=kernel_size,
-                depth_multiplier=depth_multiplier,
-                kernel_regularizer=kernel_regularizer,
-                bias_regularizer=bias_regularizer,
-                name=f"{name}_block_{i}"
-            )
-            self.conformer_blocks.append(conformer_block)
-
-    def call(self, inputs, training=False, mask=None, **kwargs):
-        # input with shape [B, T, V1, V2]
-        outputs = self.conv_subsampling(inputs, training=training)
-        outputs = self.linear(outputs, training=training)
-        pe = self.pe(outputs)
-        outputs = self.do(outputs, training=training)
-        for cblock in self.conformer_blocks:
-            outputs = cblock([outputs, pe], training=training, mask=mask, **kwargs)
-        return outputs
-
-    def get_config(self):
-        conf = super(ConformerEncoder, self).get_config()
-        conf.update(self.conv_subsampling.get_config())
-        conf.update(self.linear.get_config())
-        conf.update(self.do.get_config())
-        conf.update(self.pe.get_config())
-        for cblock in self.conformer_blocks:
-            conf.update(cblock.get_config())
-        return conf
 
 
 class Conformer(Transducer):
diff --git a/tensorflow_asr/models/transducer/contextnet.py b/tensorflow_asr/models/transducer/contextnet.py
index dac9e9050d..2f47f100ee 100644
--- a/tensorflow_asr/models/transducer/contextnet.py
+++ b/tensorflow_asr/models/transducer/contextnet.py
@@ -11,185 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Ref: https://github.com/iankur/ContextNet """
 
 from typing import List
 import tensorflow as tf
-from .transducer import Transducer
-from ...utils import math_util
-
-L2 = tf.keras.regularizers.l2(1e-6)
-
-
-def get_activation(activation: str = "silu"):
-    activation = activation.lower()
-    if activation in ["silu", "swish"]: return tf.nn.swish
-    elif activation == "relu": return tf.nn.relu
-    elif activation == "linear": return tf.keras.activations.linear
-    else: raise ValueError("activation must be either 'silu', 'swish', 'relu' or 'linear'")
-
-
-class Reshape(tf.keras.layers.Layer):
-    def call(self, inputs): return math_util.merge_two_last_dims(inputs)
-
-
-class ConvModule(tf.keras.layers.Layer):
-    def __init__(self,
-                 kernel_size: int = 3,
-                 strides: int = 1,
-                 filters: int = 256,
-                 activation: str = "silu",
-                 kernel_regularizer = None,
-                 bias_regularizer = None,
-                 **kwargs):
-        super(ConvModule, self).__init__(**kwargs)
-        self.strides = strides
-        self.conv = tf.keras.layers.SeparableConv1D(
-            filters=filters, kernel_size=kernel_size, strides=strides, padding="same",
-            depthwise_regularizer=kernel_regularizer, pointwise_regularizer=kernel_regularizer,
-            bias_regularizer=bias_regularizer, name=f"{self.name}_conv"
-        )
-        self.bn = tf.keras.layers.BatchNormalization(name=f"{self.name}_bn")
-        self.activation = get_activation(activation)
-
-    def call(self, inputs, training=False, **kwargs):
-        outputs = self.conv(inputs, training=training)
-        outputs = self.bn(outputs, training=training)
-        outputs = self.activation(outputs)
-        return outputs
-
-
-class SEModule(tf.keras.layers.Layer):
-    def __init__(self,
-                 kernel_size: int = 3,
-                 strides: int = 1,
-                 filters: int = 256,
-                 activation: str = "silu",
-                 kernel_regularizer = None,
-                 bias_regularizer = None,
-                 **kwargs):
-        super(SEModule, self).__init__(**kwargs)
-        self.conv = ConvModule(
-            kernel_size=kernel_size, strides=strides,
-            filters=filters, activation=activation,
-            kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
-            name=f"{self.name}_conv_module"
-        )
-        self.activation = get_activation(activation)
-        self.fc1 = tf.keras.layers.Dense(filters // 8, name=f"{self.name}_fc1")
-        self.fc2 = tf.keras.layers.Dense(filters, name=f"{self.name}_fc2")
-
-    def call(self, inputs, training=False, **kwargs):
-        features, input_length = inputs
-        outputs = self.conv(features, training=training)
-
-        se = tf.divide(tf.reduce_sum(outputs, axis=1), tf.expand_dims(tf.cast(input_length, dtype=outputs.dtype), axis=1))
-        se = self.fc1(se, training=training)
-        se = self.activation(se)
-        se = self.fc2(se, training=training)
-        se = self.activation(se)
-        se = tf.nn.sigmoid(se)
-        se = tf.expand_dims(se, axis=1)
-
-        outputs = tf.multiply(outputs, se)
-        return outputs
 
-
-class ConvBlock(tf.keras.layers.Layer):
-    def __init__(self,
-                 nlayers: int = 3,
-                 kernel_size: int = 3,
-                 filters: int = 256,
-                 strides: int = 1,
-                 residual: bool = True,
-                 activation: str = 'silu',
-                 alpha: float = 1.0,
-                 kernel_regularizer = None,
-                 bias_regularizer = None,
-                 **kwargs):
-        super(ConvBlock, self).__init__(**kwargs)
-
-        self.dmodel = filters
-        self.time_reduction_factor = strides
-        filters = int(filters * alpha)
-
-        self.convs = []
-        for i in range(nlayers - 1):
-            self.convs.append(
-                ConvModule(
-                    kernel_size=kernel_size, strides=1,
-                    filters=filters, activation=activation,
-                    kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
-                    name=f"{self.name}_conv_module_{i}"
-                )
-            )
-
-        self.last_conv = ConvModule(
-            kernel_size=kernel_size, strides=strides,
-            filters=filters, activation=activation,
-            kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
-            name=f"{self.name}_conv_module_{nlayers - 1}"
-        )
-
-        self.se = SEModule(
-            kernel_size=kernel_size, strides=1, filters=filters, activation=activation,
-            kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
-            name=f"{self.name}_se"
-        )
-
-        self.residual = None
-        if residual:
-            self.residual = ConvModule(
-                kernel_size=kernel_size, strides=strides,
-                filters=filters, activation="linear",
-                kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
-                name=f"{self.name}_residual"
-            )
-
-        self.activation = get_activation(activation)
-
-    def call(self, inputs, training=False, **kwargs):
-        features, input_length = inputs
-        outputs = features
-        for conv in self.convs:
-            outputs = conv(outputs, training=training)
-        outputs = self.last_conv(outputs, training=training)
-        input_length = math_util.get_reduced_length(input_length, self.last_conv.strides)
-        outputs = self.se([outputs, input_length], training=training)
-        if self.residual is not None:
-            res = self.residual(features, training=training)
-            outputs = tf.add(outputs, res)
-        outputs = self.activation(outputs)
-        return outputs, input_length
-
-
-class ContextNetEncoder(tf.keras.Model):
-    def __init__(self,
-                 blocks: List[dict] = [],
-                 alpha: float = 1.0,
-                 kernel_regularizer = None,
-                 bias_regularizer = None,
-                 **kwargs):
-        super(ContextNetEncoder, self).__init__(**kwargs)
-
-        self.reshape = Reshape(name=f"{self.name}_reshape")
-
-        self.blocks = []
-        for i, config in enumerate(blocks):
-            self.blocks.append(
-                ConvBlock(
-                    **config, alpha=alpha,
-                    kernel_regularizer=kernel_regularizer, bias_regularizer=bias_regularizer,
-                    name=f"{self.name}_block_{i}"
-                )
-            )
-
-    def call(self, inputs, training=False, **kwargs):
-        outputs, input_length = inputs
-        outputs = self.reshape(outputs)
-        for block in self.blocks:
-            outputs, input_length = block([outputs, input_length], training=training)
-        return outputs
+from ..encoders.contextnet import ContextNetEncoder, L2
+from .transducer import Transducer
 
 
 class ContextNet(Transducer):

From 29a285911d544f972e2a658997b5b999fa3d6a0f Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Wed, 14 Apr 2021 00:22:27 +0700
Subject: [PATCH 05/13] :writing_hand: update augmentations

---
 examples/conformer/config.yml                | 2 +-
 examples/contextnet/config.yml               | 2 +-
 examples/rnn_transducer/config.yml           | 2 +-
 tensorflow_asr/augmentations/README.md       | 2 +-
 tensorflow_asr/augmentations/augmentation.py | 8 ++++----
 tests/conformer/config.yml                   | 2 +-
 tests/contextnet/config.yml                  | 2 +-
 tests/streaming_transducer/config.yml        | 2 +-
 8 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/conformer/config.yml b/examples/conformer/config.yml
index d77dfa1f24..79bef5276b 100755
--- a/examples/conformer/config.yml
+++ b/examples/conformer/config.yml
@@ -66,7 +66,7 @@ learning_config:
   train_dataset_config:
     use_tf: True
     augmentation_config:
-      after:
+      feature_augment:
         time_masking:
           num_masks: 10
           mask_factor: 100
diff --git a/examples/contextnet/config.yml b/examples/contextnet/config.yml
index 5127dd1de6..24b2f17e9d 100644
--- a/examples/contextnet/config.yml
+++ b/examples/contextnet/config.yml
@@ -198,7 +198,7 @@ learning_config:
   train_dataset_config:
     use_tf: True
     augmentation_config:
-      after:
+      feature_augment:
         time_masking:
           num_masks: 10
           mask_factor: 100
diff --git a/examples/rnn_transducer/config.yml b/examples/rnn_transducer/config.yml
index 47b0e41ae9..8acfee4f92 100644
--- a/examples/rnn_transducer/config.yml
+++ b/examples/rnn_transducer/config.yml
@@ -55,7 +55,7 @@ learning_config:
   train_dataset_config:
     use_tf: True
     augmentation_config:
-      after:
+      feature_augment:
         time_masking:
           num_masks: 10
           mask_factor: 100
diff --git a/tensorflow_asr/augmentations/README.md b/tensorflow_asr/augmentations/README.md
index 6dc714a967..4723c7659d 100644
--- a/tensorflow_asr/augmentations/README.md
+++ b/tensorflow_asr/augmentations/README.md
@@ -7,7 +7,7 @@ Augmentations use `nlpaug`, for futher information, see [nlpaug.readthedocs.io](
 ```yaml
 augmentations:
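-    before: ...
-    after: ...
+    signal_augment: ...
+    feature_augment: ...
 ```
 
-Where `before` and `after` are augmentation methods to use before and after features extraction.
+Where `signal_augment` and `feature_augment` are augmentation methods applied before and after feature extraction, respectively.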
diff --git a/tensorflow_asr/augmentations/augmentation.py b/tensorflow_asr/augmentations/augmentation.py
index 314a6488b6..4ffa03df29 100644
--- a/tensorflow_asr/augmentations/augmentation.py
+++ b/tensorflow_asr/augmentations/augmentation.py
@@ -27,8 +27,8 @@ class Augmentation:
     def __init__(self, config: dict = None):
         if not config: config = {}
         self.prob = float(config.pop("prob", 0.5))
-        self.before = self.parse(config.pop("before", {}))
-        self.after = self.parse(config.pop("after", {}))
+        self.signal_augmentations = self.parse(config.pop("signal_augment", {}))
+        self.feature_augmentations = self.parse(config.pop("feature_augment", {}))
 
     def _augment(self, inputs, augmentations):
         outputs = inputs
@@ -39,11 +39,11 @@ def _augment(self, inputs, augmentations):
 
     @tf.function
     def signal_augment(self, inputs):
-        return self._augment(inputs, self.before)
+        return self._augment(inputs, self.signal_augmentations)
 
     @tf.function
     def feature_augment(self, inputs):
-        return self._augment(inputs, self.after)
+        return self._augment(inputs, self.feature_augmentations)
 
     @staticmethod
     def parse(config: dict) -> list:
diff --git a/tests/conformer/config.yml b/tests/conformer/config.yml
index 3f4bd41415..5e94f0cd41 100644
--- a/tests/conformer/config.yml
+++ b/tests/conformer/config.yml
@@ -62,7 +62,7 @@ model_config:
 
 learning_config:
   augmentations:
-    after:
+    feature_augment:
       time_masking:
         num_masks: 10
         mask_factor: 100
diff --git a/tests/contextnet/config.yml b/tests/contextnet/config.yml
index 7b5d8d2333..a510b710ad 100644
--- a/tests/contextnet/config.yml
+++ b/tests/contextnet/config.yml
@@ -196,7 +196,7 @@ model_config:
 
 learning_config:
   augmentations:
-    after:
+    feature_augment:
       time_masking:
         num_masks: 10
         mask_factor: 100
diff --git a/tests/streaming_transducer/config.yml b/tests/streaming_transducer/config.yml
index ff2c6a4ed5..4f8d3e52d9 100644
--- a/tests/streaming_transducer/config.yml
+++ b/tests/streaming_transducer/config.yml
@@ -53,7 +53,7 @@ model_config:
 
 learning_config:
   augmentations:
-    after:
+    feature_augment:
       time_masking:
         num_masks: 10
         mask_factor: 100

From ccfd924b6fab544258ed6c270c53686f9f8b9298 Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Wed, 14 Apr 2021 00:41:38 +0700
Subject: [PATCH 06/13] :writing_hand: update examples scripts

---
 examples/conformer/masking/README.md          |   5 -
 examples/conformer/masking/masking.py         |  32 ----
 .../masking/train_ga_masking_conformer.py     | 131 ----------------
 .../train_ga_masking_subword_conformer.py     | 147 ------------------
 .../masking/train_masking_conformer.py        | 128 ---------------
 .../train_masking_subword_conformer.py        | 143 -----------------
 examples/conformer/masking/trainer.py         |  55 -------
 .../conformer/save_conformer_from_weights.py  |  68 --------
 .../{test_subword_conformer.py => test.py}    |   0
 examples/conformer/test_conformer.py          |  85 ----------
 ...{tflite_subword_conformer.py => tflite.py} |   0
 examples/conformer/tflite_conformer.py        |  66 --------
 ...in_keras_subword_conformer.py => train.py} |  20 +--
 examples/conformer/train_conformer.py         | 106 -------------
 examples/conformer/train_ga_conformer.py      | 108 -------------
 .../conformer/train_ga_subword_conformer.py   | 127 ---------------
 examples/conformer/train_subword_conformer.py | 124 ---------------
 ...eras_subword_conformer.py => train_tpu.py} |  20 +--
 .../{test_subword_contextnet.py => test.py}   |   0
 examples/contextnet/test_contextnet.py        |  85 ----------
 ...tflite_subword_contextnet.py => tflite.py} |   0
 examples/contextnet/tflite_contextnet.py      |  67 --------
 ...n_keras_subword_contextnet.py => train.py} |   0
 examples/contextnet/train_contextnet.py       | 106 -------------
 examples/contextnet/train_ga_contextnet.py    | 108 -------------
 .../contextnet/train_ga_subword_contextnet.py | 122 ---------------
 .../contextnet/train_subword_contextnet.py    | 119 --------------
 examples/deepspeech2/{test_ds2.py => test.py} |   0
 .../{train_keras_ds2.py => train.py}          |   0
 examples/deepspeech2/train_ds2.py             |  88 -----------
 examples/deepspeech2/train_ga_ds2.py          |  91 -----------
 examples/jasper/{test_jasper.py => test.py}   |   0
 .../{train_keras_jasper.py => train.py}       |   0
 examples/jasper/train_ga_jasper.py            |  91 -----------
 examples/jasper/train_jasper.py               |  90 -----------
 ...test_subword_rnn_transducer.py => test.py} |   0
 .../rnn_transducer/test_rnn_transducer.py     |  88 -----------
 ...te_subword_rnn_transducer.py => tflite.py} |   0
 .../rnn_transducer/tflite_rnn_transducer.py   |  70 ---------
 ...ras_subword_rnn_transducer.py => train.py} |  20 +--
 .../rnn_transducer/train_ga_rnn_transducer.py | 100 ------------
 .../train_ga_subword_rnn_transducer.py        | 116 --------------
 .../rnn_transducer/train_rnn_transducer.py    |  97 ------------
 .../train_subword_rnn_transducer.py           | 111 -------------
 44 files changed, 21 insertions(+), 2913 deletions(-)
 delete mode 100644 examples/conformer/masking/README.md
 delete mode 100644 examples/conformer/masking/masking.py
 delete mode 100644 examples/conformer/masking/train_ga_masking_conformer.py
 delete mode 100644 examples/conformer/masking/train_ga_masking_subword_conformer.py
 delete mode 100644 examples/conformer/masking/train_masking_conformer.py
 delete mode 100644 examples/conformer/masking/train_masking_subword_conformer.py
 delete mode 100644 examples/conformer/masking/trainer.py
 delete mode 100644 examples/conformer/save_conformer_from_weights.py
 rename examples/conformer/{test_subword_conformer.py => test.py} (100%)
 mode change 100755 => 100644
 delete mode 100755 examples/conformer/test_conformer.py
 rename examples/conformer/{tflite_subword_conformer.py => tflite.py} (100%)
 delete mode 100644 examples/conformer/tflite_conformer.py
 rename examples/conformer/{train_keras_subword_conformer.py => train.py} (88%)
 delete mode 100644 examples/conformer/train_conformer.py
 delete mode 100644 examples/conformer/train_ga_conformer.py
 delete mode 100644 examples/conformer/train_ga_subword_conformer.py
 delete mode 100644 examples/conformer/train_subword_conformer.py
 rename examples/conformer/{train_tpu_keras_subword_conformer.py => train_tpu.py} (88%)
 rename examples/contextnet/{test_subword_contextnet.py => test.py} (100%)
 delete mode 100644 examples/contextnet/test_contextnet.py
 rename examples/contextnet/{tflite_subword_contextnet.py => tflite.py} (100%)
 delete mode 100644 examples/contextnet/tflite_contextnet.py
 rename examples/contextnet/{train_keras_subword_contextnet.py => train.py} (100%)
 delete mode 100644 examples/contextnet/train_contextnet.py
 delete mode 100644 examples/contextnet/train_ga_contextnet.py
 delete mode 100644 examples/contextnet/train_ga_subword_contextnet.py
 delete mode 100644 examples/contextnet/train_subword_contextnet.py
 rename examples/deepspeech2/{test_ds2.py => test.py} (100%)
 rename examples/deepspeech2/{train_keras_ds2.py => train.py} (100%)
 delete mode 100644 examples/deepspeech2/train_ds2.py
 delete mode 100644 examples/deepspeech2/train_ga_ds2.py
 rename examples/jasper/{test_jasper.py => test.py} (100%)
 rename examples/jasper/{train_keras_jasper.py => train.py} (100%)
 delete mode 100644 examples/jasper/train_ga_jasper.py
 delete mode 100644 examples/jasper/train_jasper.py
 rename examples/rnn_transducer/{test_subword_rnn_transducer.py => test.py} (100%)
 delete mode 100644 examples/rnn_transducer/test_rnn_transducer.py
 rename examples/rnn_transducer/{tflite_subword_rnn_transducer.py => tflite.py} (100%)
 delete mode 100644 examples/rnn_transducer/tflite_rnn_transducer.py
 rename examples/rnn_transducer/{train_keras_subword_rnn_transducer.py => train.py} (88%)
 delete mode 100644 examples/rnn_transducer/train_ga_rnn_transducer.py
 delete mode 100644 examples/rnn_transducer/train_ga_subword_rnn_transducer.py
 delete mode 100644 examples/rnn_transducer/train_rnn_transducer.py
 delete mode 100644 examples/rnn_transducer/train_subword_rnn_transducer.py

diff --git a/examples/conformer/masking/README.md b/examples/conformer/masking/README.md
deleted file mode 100644
index f63d41a3b9..0000000000
--- a/examples/conformer/masking/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Training Conformer with Attention Masking
-
-This is an example for anyone who wants to apply masking in Conformer.
-
-**Note**: This is not a good practice since Conformer uses time reduction, which leads to create incorrect maskings.
\ No newline at end of file
diff --git a/examples/conformer/masking/masking.py b/examples/conformer/masking/masking.py
deleted file mode 100644
index 69f8e0b01a..0000000000
--- a/examples/conformer/masking/masking.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import tensorflow as tf
-from tensorflow_asr.utils.utils import shape_list, get_reduced_length
-
-
-def create_padding_mask(features, input_length, time_reduction_factor):
-    """
-    Create masking with 0 for paddings and 1 for non-paddings
-    Args:
-        features ([tf.Tensor]): audio features with shape [B, T, F, C]
-        input_length ([tf.Tensor]): audio features length with shape [B]
-        time_reduction_factor ([int])
-
-    Returns:
-        [tf.Tensor]: with shape [B, Tquery, Tkey]
-    """
-    batch_size, padded_time, _, _ = shape_list(features)
-    reduced_padded_time = get_reduced_length(padded_time, time_reduction_factor)
-
-    def create_mask(length):
-        reduced_length = get_reduced_length(length, time_reduction_factor)
-        mask = tf.ones([reduced_length, reduced_length], dtype=tf.float32)
-        return tf.pad(
-            mask,
-            [
-                [0, reduced_padded_time - reduced_length],
-                [0, reduced_padded_time - reduced_length]
-            ],
-            mode="CONSTANT",
-            constant_values=0.0
-        )
-
-    return tf.map_fn(create_mask, input_length, fn_output_signature=tf.TensorSpec([None, None], dtype=tf.float32))
diff --git a/examples/conformer/masking/train_ga_masking_conformer.py b/examples/conformer/masking/train_ga_masking_conformer.py
deleted file mode 100644
index 62a0deb240..0000000000
--- a/examples/conformer/masking/train_ga_masking_conformer.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10,
-                    help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true",
-                    help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None,
-                    help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None,
-                    help="Evaluation batch size per replica")
-
-parser.add_argument("--acs", type=int, default=None,
-                    help="Train accumulation steps")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0],
-                    help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true",
-                    help="Enable mixed precision")
-
-parser.add_argument("--cache", default=False, action="store_true",
-                    help="Enable caching for dataset")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from trainer import TrainerWithMaskingGA
-from tensorflow_asr.models.conformer import Conformer
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        data_paths=config.learning_config.dataset_config.train_paths,
-        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        augmentations=config.learning_config.augmentations,
-        stage="train", cache=args.cache, shuffle=True
-    )
-    eval_dataset = ASRTFRecordDataset(
-        data_paths=config.learning_config.dataset_config.eval_paths,
-        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="eval", cache=args.cache, shuffle=True
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        data_paths=config.learning_config.dataset_config.train_paths,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        augmentations=config.learning_config.augmentations,
-        stage="train", cache=args.cache, shuffle=True
-    )
-    eval_dataset = ASRSliceDataset(
-        data_paths=config.learning_config.dataset_config.eval_paths,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="eval", cache=args.cache, shuffle=True
-    )
-
-conformer_trainer = TrainerWithMaskingGA(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with conformer_trainer.strategy.scope():
-    # build model
-    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    conformer._build(speech_featurizer.shape)
-    conformer.summary(line_length=120)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=config.model_config["encoder_dmodel"],
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
-        ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
-    )
-
-conformer_trainer.compile(model=conformer, optimizer=optimizer,
-                          max_to_keep=args.max_ckpts)
-
-conformer_trainer.fit(train_dataset, eval_dataset,
-                      train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/examples/conformer/masking/train_ga_masking_subword_conformer.py b/examples/conformer/masking/train_ga_masking_subword_conformer.py
deleted file mode 100644
index 1e74f9a68b..0000000000
--- a/examples/conformer/masking/train_ga_masking_subword_conformer.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10,
-                    help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true",
-                    help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None,
-                    help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None,
-                    help="Evaluation batch size per replica")
-
-parser.add_argument("--acs", type=int, default=None,
-                    help="Train accumulation steps")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0],
-                    help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true",
-                    help="Enable mixed precision")
-
-parser.add_argument("--cache", default=False, action="store_true",
-                    help="Enable caching for dataset")
-
-parser.add_argument("--subwords", type=str, default=None,
-                    help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[],
-                    help="Transcript files for generating subwords")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
-from trainer import TrainerWithMaskingGA
-from tensorflow_asr.models.conformer import Conformer
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
-else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        data_paths=config.learning_config.dataset_config.train_paths,
-        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        augmentations=config.learning_config.augmentations,
-        stage="train", cache=args.cache, shuffle=True
-    )
-    eval_dataset = ASRTFRecordDataset(
-        data_paths=config.learning_config.dataset_config.eval_paths,
-        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="eval", cache=args.cache, shuffle=True
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        data_paths=config.learning_config.dataset_config.train_paths,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        augmentations=config.learning_config.augmentations,
-        stage="train", cache=args.cache, shuffle=True
-    )
-    eval_dataset = ASRSliceDataset(
-        data_paths=config.learning_config.dataset_config.eval_paths,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="eval", cache=args.cache, shuffle=True
-    )
-
-conformer_trainer = TrainerWithMaskingGA(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with conformer_trainer.strategy.scope():
-    # build model
-    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    conformer._build(speech_featurizer.shape)
-    conformer.summary(line_length=120)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=config.model_config["encoder_dmodel"],
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
-        ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
-    )
-
-conformer_trainer.compile(model=conformer, optimizer=optimizer,
-                          max_to_keep=args.max_ckpts)
-
-conformer_trainer.fit(train_dataset, eval_dataset,
-                      train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/examples/conformer/masking/train_masking_conformer.py b/examples/conformer/masking/train_masking_conformer.py
deleted file mode 100644
index 82dbbda9ec..0000000000
--- a/examples/conformer/masking/train_masking_conformer.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10,
-                    help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true",
-                    help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None,
-                    help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None,
-                    help="Evaluation batch size per replica")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0],
-                    help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true",
-                    help="Enable mixed precision")
-
-parser.add_argument("--cache", default=False, action="store_true",
-                    help="Enable caching for dataset")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from trainer import TrainerWithMasking
-from tensorflow_asr.models.conformer import Conformer
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        data_paths=config.learning_config.dataset_config.train_paths,
-        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        augmentations=config.learning_config.augmentations,
-        stage="train", cache=args.cache, shuffle=True
-    )
-    eval_dataset = ASRTFRecordDataset(
-        data_paths=config.learning_config.dataset_config.eval_paths,
-        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="eval", cache=args.cache, shuffle=True
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        data_paths=config.learning_config.dataset_config.train_paths,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        augmentations=config.learning_config.augmentations,
-        stage="train", cache=args.cache, shuffle=True
-    )
-    eval_dataset = ASRSliceDataset(
-        data_paths=config.learning_config.dataset_config.eval_paths,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="eval", cache=args.cache, shuffle=True
-    )
-
-conformer_trainer = TrainerWithMasking(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with conformer_trainer.strategy.scope():
-    # build model
-    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    conformer._build(speech_featurizer.shape)
-    conformer.summary(line_length=120)
-
-    optimizer_config = config.learning_config.optimizer_config
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=config.model_config["encoder_dmodel"],
-            warmup_steps=optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
-        ),
-        beta_1=optimizer_config["beta1"],
-        beta_2=optimizer_config["beta2"],
-        epsilon=optimizer_config["epsilon"]
-    )
-
-conformer_trainer.compile(model=conformer, optimizer=optimizer,
-                          max_to_keep=args.max_ckpts)
-
-conformer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
diff --git a/examples/conformer/masking/train_masking_subword_conformer.py b/examples/conformer/masking/train_masking_subword_conformer.py
deleted file mode 100644
index be99ec3ceb..0000000000
--- a/examples/conformer/masking/train_masking_subword_conformer.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10,
-                    help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true",
-                    help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None,
-                    help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None,
-                    help="Evaluation batch size per replica")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0],
-                    help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true",
-                    help="Enable mixed precision")
-
-parser.add_argument("--cache", default=False, action="store_true",
-                    help="Enable caching for dataset")
-
-parser.add_argument("--subwords", type=str, default=None,
-                    help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[],
-                    help="Transcript files for generating subwords")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
-from trainer import TrainerWithMasking
-from tensorflow_asr.models.conformer import Conformer
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
-else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        data_paths=config.learning_config.dataset_config.train_paths,
-        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        augmentations=config.learning_config.augmentations,
-        stage="train", cache=args.cache, shuffle=True
-    )
-    eval_dataset = ASRTFRecordDataset(
-        data_paths=config.learning_config.dataset_config.eval_paths,
-        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="eval", cache=args.cache, shuffle=True
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        data_paths=config.learning_config.dataset_config.train_paths,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        augmentations=config.learning_config.augmentations,
-        stage="train", cache=args.cache, shuffle=True
-    )
-    eval_dataset = ASRSliceDataset(
-        data_paths=config.learning_config.dataset_config.eval_paths,
-        speech_featurizer=speech_featurizer,
-        text_featurizer=text_featurizer,
-        stage="eval", cache=args.cache, shuffle=True
-    )
-
-conformer_trainer = TrainerWithMasking(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with conformer_trainer.strategy.scope():
-    # build model
-    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    conformer._build(speech_featurizer.shape)
-    conformer.summary(line_length=120)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=config.model_config["encoder_dmodel"],
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
-        ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
-    )
-
-conformer_trainer.compile(model=conformer, optimizer=optimizer,
-                          max_to_keep=args.max_ckpts)
-
-conformer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
diff --git a/examples/conformer/masking/trainer.py b/examples/conformer/masking/trainer.py
deleted file mode 100644
index b860eafb2c..0000000000
--- a/examples/conformer/masking/trainer.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import tensorflow as tf
-
-from masking import create_padding_mask
-from tensorflow_asr.runners.transducer_runners import TransducerTrainer, TransducerTrainerGA
-from tensorflow_asr.losses.rnnt_losses import rnnt_loss
-from tensorflow_asr.utils.utils import get_reduced_length
-
-
-class TrainerWithMasking(TransducerTrainer):
-    @tf.function(experimental_relax_shapes=True)
-    def _train_step(self, batch):
-        _, features, input_length, labels, label_length, pred_inp = batch
-
-        mask = create_padding_mask(features, input_length, self.model.time_reduction_factor)
-
-        with tf.GradientTape() as tape:
-            logits = self.model([features, input_length, pred_inp, label_length + 1], training=True, mask=mask)
-            tape.watch(logits)
-            per_train_loss = rnnt_loss(
-                logits=logits, labels=labels, label_length=label_length,
-                logit_length=get_reduced_length(input_length, self.model.time_reduction_factor),
-                blank=self.text_featurizer.blank
-            )
-            train_loss = tf.nn.compute_average_loss(per_train_loss,
-                                                    global_batch_size=self.global_batch_size)
-
-        gradients = tape.gradient(train_loss, self.model.trainable_variables)
-        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
-
-        self.train_metrics["transducer_loss"].update_state(per_train_loss)
-
-
-class TrainerWithMaskingGA(TransducerTrainerGA):
-    @tf.function(experimental_relax_shapes=True)
-    def _train_step(self, batch):
-        _, features, input_length, labels, label_length, pred_inp = batch
-
-        mask = create_padding_mask(features, input_length, self.model.time_reduction_factor)
-
-        with tf.GradientTape() as tape:
-            logits = self.model([features, input_length, pred_inp, label_length + 1], training=True, mask=mask)
-            tape.watch(logits)
-            per_train_loss = rnnt_loss(
-                logits=logits, labels=labels, label_length=label_length,
-                logit_length=get_reduced_length(input_length, self.model.time_reduction_factor),
-                blank=self.text_featurizer.blank
-            )
-            train_loss = tf.nn.compute_average_loss(
-                per_train_loss,
-                global_batch_size=self.global_batch_size
-            )
-
-        gradients = tape.gradient(train_loss, self.model.trainable_variables)
-        self.accumulation.accumulate(gradients)
-        self.train_metrics["transducer_loss"].update_state(per_train_loss)
diff --git a/examples/conformer/save_conformer_from_weights.py b/examples/conformer/save_conformer_from_weights.py
deleted file mode 100644
index bb09c7d329..0000000000
--- a/examples/conformer/save_conformer_from_weights.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_devices
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Testing")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
-
-parser.add_argument("--saved", type=str, default=None,
-                    help="Path to saved model")
-
-parser.add_argument("--device", type=int, default=0,
-                    help="Device's id to run test on")
-
-parser.add_argument("--cpu", default=False, action="store_true",
-                    help="Whether to only use cpu")
-
-parser.add_argument("output", type=str, default=None,
-                    help="Output to save whole model")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-setup_devices([args.device], cpu=args.cpu)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.models.conformer import Conformer
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-tf.random.set_seed(0)
-assert args.saved
-
-# build model
-conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-conformer._build(speech_featurizer.shape)
-conformer.load_weights(args.saved)
-conformer.summary(line_length=150)
-conformer.save(args.output)
-
-print(f"Saved whole model to {args.output}")
diff --git a/examples/conformer/test_subword_conformer.py b/examples/conformer/test.py
old mode 100755
new mode 100644
similarity index 100%
rename from examples/conformer/test_subword_conformer.py
rename to examples/conformer/test.py
diff --git a/examples/conformer/test_conformer.py b/examples/conformer/test_conformer.py
deleted file mode 100755
index 17f40a6d5f..0000000000
--- a/examples/conformer/test_conformer.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_devices
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Testing")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
-
-parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
-
-parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-setup_devices([args.device], cpu=args.cpu)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.base_runners import BaseTester
-from tensorflow_asr.models.conformer import Conformer
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-tf.random.set_seed(0)
-assert args.saved
-
-if args.tfrecords:
-    test_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.test_dataset_config)
-    )
-else:
-    test_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.test_dataset_config)
-    )
-
-# build model
-conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-conformer._build(speech_featurizer.shape)
-conformer.load_weights(args.saved)
-conformer.summary(line_length=120)
-conformer.add_featurizers(speech_featurizer, text_featurizer)
-
-conformer_tester = BaseTester(
-    config=config.learning_config.running_config,
-    output_name=args.output_name
-)
-conformer_tester.compile(conformer)
-conformer_tester.run(test_dataset)
diff --git a/examples/conformer/tflite_subword_conformer.py b/examples/conformer/tflite.py
similarity index 100%
rename from examples/conformer/tflite_subword_conformer.py
rename to examples/conformer/tflite.py
diff --git a/examples/conformer/tflite_conformer.py b/examples/conformer/tflite_conformer.py
deleted file mode 100644
index a44997a3be..0000000000
--- a/examples/conformer/tflite_conformer.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment
-
-setup_environment()
-import tensorflow as tf
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.models.conformer import Conformer
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Testing")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
-
-parser.add_argument("--saved", type=str, default=None,
-                    help="Path to saved model")
-
-parser.add_argument("output", type=str, default=None,
-                    help="TFLite file path to be exported")
-
-args = parser.parse_args()
-
-assert args.saved and args.output
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-# build model
-conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-conformer._build(speech_featurizer.shape)
-conformer.load_weights(args.saved)
-conformer.summary(line_length=150)
-conformer.add_featurizers(speech_featurizer, text_featurizer)
-
-concrete_func = conformer.make_tflite_function().get_concrete_function()
-converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
-converter.optimizations = [tf.lite.Optimize.DEFAULT]
-converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
-tflite_model = converter.convert()
-
-if not os.path.exists(os.path.dirname(args.output)):
-    os.makedirs(os.path.dirname(args.output))
-with open(args.output, "wb") as tflite_out:
-    tflite_out.write(tflite_model)
diff --git a/examples/conformer/train_keras_subword_conformer.py b/examples/conformer/train.py
similarity index 88%
rename from examples/conformer/train_keras_subword_conformer.py
rename to examples/conformer/train.py
index 7f2219cff2..0c844062a1 100644
--- a/examples/conformer/train_keras_subword_conformer.py
+++ b/examples/conformer/train.py
@@ -46,9 +46,7 @@
 
 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
 
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords")
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
 
 args = parser.parse_args()
 
@@ -59,7 +57,7 @@
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
 from tensorflow_asr.models.keras.conformer import Conformer
 from tensorflow_asr.optimizers.schedules import TransformerSchedule
 
@@ -68,17 +66,13 @@
 
 if args.sentence_piece:
     print("Loading SentencePiece model ...")
-    text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords)
-elif args.subwords and os.path.exists(args.subwords):
+    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
     print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
+    print("Use characters ...")
+    text_featurizer = CharFeaturizer(config.decoder_config)
 
 if args.tfrecords:
     train_dataset = ASRTFRecordDatasetKeras(
diff --git a/examples/conformer/train_conformer.py b/examples/conformer/train_conformer.py
deleted file mode 100644
index e919f953d3..0000000000
--- a/examples/conformer/train_conformer.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainer
-from tensorflow_asr.models.conformer import Conformer
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-conformer_trainer = TransducerTrainer(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with conformer_trainer.strategy.scope():
-    # build model
-    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    conformer._build(speech_featurizer.shape)
-    conformer.summary(line_length=120)
-
-    optimizer_config = config.learning_config.optimizer_config
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=conformer.dmodel,
-            warmup_steps=optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(conformer.dmodel))
-        ),
-        beta_1=optimizer_config["beta1"],
-        beta_2=optimizer_config["beta2"],
-        epsilon=optimizer_config["epsilon"]
-    )
-
-conformer_trainer.compile(model=conformer, optimizer=optimizer,
-                          max_to_keep=args.max_ckpts)
-
-conformer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
diff --git a/examples/conformer/train_ga_conformer.py b/examples/conformer/train_ga_conformer.py
deleted file mode 100644
index d2ca6ade2c..0000000000
--- a/examples/conformer/train_ga_conformer.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA
-from tensorflow_asr.models.conformer import Conformer
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-conformer_trainer = TransducerTrainerGA(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with conformer_trainer.strategy.scope():
-    # build model
-    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    conformer._build(speech_featurizer.shape)
-    conformer.summary(line_length=120)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=conformer.dmodel,
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(conformer.dmodel))
-        ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
-    )
-
-conformer_trainer.compile(model=conformer, optimizer=optimizer,
-                          max_to_keep=args.max_ckpts)
-
-conformer_trainer.fit(train_dataset, eval_dataset,
-                      train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/examples/conformer/train_ga_subword_conformer.py b/examples/conformer/train_ga_subword_conformer.py
deleted file mode 100644
index c36d1a5468..0000000000
--- a/examples/conformer/train_ga_subword_conformer.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps")
-
-parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA
-from tensorflow_asr.models.conformer import Conformer
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-
-if args.sentence_piece:
-    print("Loading SentencePiece model ...")
-    text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords)
-elif args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
-else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-conformer_trainer = TransducerTrainerGA(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with conformer_trainer.strategy.scope():
-    # build model
-    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    conformer._build(speech_featurizer.shape)
-    conformer.summary(line_length=120)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=conformer.dmodel,
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(conformer.dmodel))
-        ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
-    )
-
-conformer_trainer.compile(model=conformer, optimizer=optimizer,
-                          max_to_keep=args.max_ckpts)
-
-conformer_trainer.fit(train_dataset, eval_dataset,
-                      train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/examples/conformer/train_subword_conformer.py b/examples/conformer/train_subword_conformer.py
deleted file mode 100644
index 74c143894e..0000000000
--- a/examples/conformer/train_subword_conformer.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainer
-from tensorflow_asr.models.conformer import Conformer
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-
-if args.sentence_piece:
-    print("Loading SentencePiece model ...")
-    text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords)
-elif args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
-else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-conformer_trainer = TransducerTrainer(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with conformer_trainer.strategy.scope():
-    # build model
-    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    conformer._build(speech_featurizer.shape)
-    conformer.summary(line_length=120)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=conformer.dmodel,
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(conformer.dmodel))
-        ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
-    )
-
-conformer_trainer.compile(model=conformer, optimizer=optimizer,
-                          max_to_keep=args.max_ckpts)
-
-conformer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
diff --git a/examples/conformer/train_tpu_keras_subword_conformer.py b/examples/conformer/train_tpu.py
similarity index 88%
rename from examples/conformer/train_tpu_keras_subword_conformer.py
rename to examples/conformer/train_tpu.py
index 8162a3bdae..8a0937c985 100644
--- a/examples/conformer/train_tpu_keras_subword_conformer.py
+++ b/examples/conformer/train_tpu.py
@@ -42,9 +42,7 @@
 
 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
 
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords")
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
 
 parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
 
@@ -59,7 +57,7 @@
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
 from tensorflow_asr.models.keras.conformer import Conformer
 from tensorflow_asr.optimizers.schedules import TransformerSchedule
 
@@ -68,17 +66,13 @@
 
 if args.sentence_piece:
     print("Loading SentencePiece model ...")
-    text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords)
-elif args.subwords and os.path.exists(args.subwords):
+    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
     print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
+    print("Use characters...")
+    text_featurizer = CharFeaturizer(config.decoder_config)
 
 train_dataset = ASRTFRecordDatasetKeras(
     speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
diff --git a/examples/contextnet/test_subword_contextnet.py b/examples/contextnet/test.py
similarity index 100%
rename from examples/contextnet/test_subword_contextnet.py
rename to examples/contextnet/test.py
diff --git a/examples/contextnet/test_contextnet.py b/examples/contextnet/test_contextnet.py
deleted file mode 100644
index d62a9bf954..0000000000
--- a/examples/contextnet/test_contextnet.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_devices
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="ContextNet Testing")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
-
-parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
-
-parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-setup_devices([args.device], cpu=args.cpu)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.base_runners import BaseTester
-from tensorflow_asr.models.contextnet import ContextNet
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-tf.random.set_seed(0)
-assert args.saved
-
-if args.tfrecords:
-    test_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.test_dataset_config)
-    )
-else:
-    test_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.test_dataset_config)
-    )
-
-# build model
-contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-contextnet._build(speech_featurizer.shape)
-contextnet.load_weights(args.saved)
-contextnet.summary(line_length=120)
-contextnet.add_featurizers(speech_featurizer, text_featurizer)
-
-contextnet_tester = BaseTester(
-    config=config.learning_config.running_config,
-    output_name=args.output_name
-)
-contextnet_tester.compile(contextnet)
-contextnet_tester.run(test_dataset)
diff --git a/examples/contextnet/tflite_subword_contextnet.py b/examples/contextnet/tflite.py
similarity index 100%
rename from examples/contextnet/tflite_subword_contextnet.py
rename to examples/contextnet/tflite.py
diff --git a/examples/contextnet/tflite_contextnet.py b/examples/contextnet/tflite_contextnet.py
deleted file mode 100644
index 4452ce8394..0000000000
--- a/examples/contextnet/tflite_contextnet.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment
-
-setup_environment()
-import tensorflow as tf
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.models.contextnet import ContextNet
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="ContextNet Testing")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
-
-parser.add_argument("--saved", type=str, default=None,
-                    help="Path to saved model")
-
-parser.add_argument("output", type=str, default=None,
-                    help="TFLite file path to be exported")
-
-args = parser.parse_args()
-
-assert args.saved and args.output
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-# build model
-contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-contextnet._build(speech_featurizer.shape)
-contextnet.load_weights(args.saved)
-contextnet.summary(line_length=150)
-contextnet.add_featurizers(speech_featurizer, text_featurizer)
-
-concrete_func = contextnet.make_tflite_function().get_concrete_function()
-converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
-converter.optimizations = [tf.lite.Optimize.DEFAULT]
-converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
-                                       tf.lite.OpsSet.SELECT_TF_OPS]
-tflite_model = converter.convert()
-
-if not os.path.exists(os.path.dirname(args.output)):
-    os.makedirs(os.path.dirname(args.output))
-with open(args.output, "wb") as tflite_out:
-    tflite_out.write(tflite_model)
diff --git a/examples/contextnet/train_keras_subword_contextnet.py b/examples/contextnet/train.py
similarity index 100%
rename from examples/contextnet/train_keras_subword_contextnet.py
rename to examples/contextnet/train.py
diff --git a/examples/contextnet/train_contextnet.py b/examples/contextnet/train_contextnet.py
deleted file mode 100644
index a3a261a3fb..0000000000
--- a/examples/contextnet/train_contextnet.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="ContextNet Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainer
-from tensorflow_asr.models.contextnet import ContextNet
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-contextnet_trainer = TransducerTrainer(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with contextnet_trainer.strategy.scope():
-    # build model
-    contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    contextnet._build(speech_featurizer.shape)
-    contextnet.summary(line_length=120)
-
-    optimizer_config = config.learning_config.optimizer_config
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=contextnet.dmodel,
-            warmup_steps=optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(contextnet.dmodel))
-        ),
-        beta_1=optimizer_config["beta1"],
-        beta_2=optimizer_config["beta2"],
-        epsilon=optimizer_config["epsilon"]
-    )
-
-contextnet_trainer.compile(model=contextnet, optimizer=optimizer,
-                           max_to_keep=args.max_ckpts)
-
-contextnet_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
diff --git a/examples/contextnet/train_ga_contextnet.py b/examples/contextnet/train_ga_contextnet.py
deleted file mode 100644
index d906ce2ba3..0000000000
--- a/examples/contextnet/train_ga_contextnet.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="ContextNet Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA
-from tensorflow_asr.models.contextnet import ContextNet
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-contextnet_trainer = TransducerTrainerGA(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with contextnet_trainer.strategy.scope():
-    # build model
-    contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    contextnet._build(speech_featurizer.shape)
-    contextnet.summary(line_length=120)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=contextnet.dmodel,
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(contextnet.dmodel))
-        ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
-    )
-
-contextnet_trainer.compile(model=contextnet, optimizer=optimizer,
-                           max_to_keep=args.max_ckpts)
-
-contextnet_trainer.fit(train_dataset, eval_dataset,
-                       train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/examples/contextnet/train_ga_subword_contextnet.py b/examples/contextnet/train_ga_subword_contextnet.py
deleted file mode 100644
index b1f1cec0f2..0000000000
--- a/examples/contextnet/train_ga_subword_contextnet.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="ContextNet Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA
-from tensorflow_asr.models.contextnet import ContextNet
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
-else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-contextnet_trainer = TransducerTrainerGA(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with contextnet_trainer.strategy.scope():
-    # build model
-    contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    contextnet._build(speech_featurizer.shape)
-    contextnet.summary(line_length=120)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=contextnet.dmodel,
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(contextnet.dmodel))
-        ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
-    )
-
-contextnet_trainer.compile(model=contextnet, optimizer=optimizer,
-                           max_to_keep=args.max_ckpts)
-
-contextnet_trainer.fit(train_dataset, eval_dataset,
-                       train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/examples/contextnet/train_subword_contextnet.py b/examples/contextnet/train_subword_contextnet.py
deleted file mode 100644
index 74e07e88da..0000000000
--- a/examples/contextnet/train_subword_contextnet.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="ContextNet Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainer
-from tensorflow_asr.models.contextnet import ContextNet
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
-else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-contextnet_trainer = TransducerTrainer(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with contextnet_trainer.strategy.scope():
-    # build model
-    contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    contextnet._build(speech_featurizer.shape)
-    contextnet.summary(line_length=120)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=contextnet.dmodel,
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(contextnet.dmodel))
-        ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
-    )
-
-contextnet_trainer.compile(model=contextnet, optimizer=optimizer,
-                           max_to_keep=args.max_ckpts)
-
-contextnet_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
diff --git a/examples/deepspeech2/test_ds2.py b/examples/deepspeech2/test.py
similarity index 100%
rename from examples/deepspeech2/test_ds2.py
rename to examples/deepspeech2/test.py
diff --git a/examples/deepspeech2/train_keras_ds2.py b/examples/deepspeech2/train.py
similarity index 100%
rename from examples/deepspeech2/train_keras_ds2.py
rename to examples/deepspeech2/train.py
diff --git a/examples/deepspeech2/train_ds2.py b/examples/deepspeech2/train_ds2.py
deleted file mode 100644
index 8f1d201ed9..0000000000
--- a/examples/deepspeech2/train_ds2.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Deep Speech 2 Training")
-
-parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.ctc_runners import CTCTrainer
-from tensorflow_asr.models.deepspeech2 import DeepSpeech2
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-ctc_trainer = CTCTrainer(text_featurizer, config.learning_config.running_config)
-# Build DS2 model
-with ctc_trainer.strategy.scope():
-    ds2_model = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    ds2_model._build(speech_featurizer.shape)
-    ds2_model.summary(line_length=120)
-# Compile
-ctc_trainer.compile(ds2_model, config.learning_config.optimizer_config,
-                    max_to_keep=args.max_ckpts)
-
-ctc_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
diff --git a/examples/deepspeech2/train_ga_ds2.py b/examples/deepspeech2/train_ga_ds2.py
deleted file mode 100644
index 5996859552..0000000000
--- a/examples/deepspeech2/train_ga_ds2.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Deep Speech 2 Training")
-
-parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas")
-
-parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.ctc_runners import CTCTrainerGA
-from tensorflow_asr.models.deepspeech2 import DeepSpeech2
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-ctc_trainer = CTCTrainerGA(text_featurizer, config.learning_config.running_config)
-# Build DS2 model
-with ctc_trainer.strategy.scope():
-    ds2_model = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    ds2_model._build(speech_featurizer.shape)
-    ds2_model.summary(line_length=120)
-# Compile
-ctc_trainer.compile(ds2_model, config.learning_config.optimizer_config,
-                    max_to_keep=args.max_ckpts)
-
-ctc_trainer.fit(train_dataset, eval_dataset,
-                train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/examples/jasper/test_jasper.py b/examples/jasper/test.py
similarity index 100%
rename from examples/jasper/test_jasper.py
rename to examples/jasper/test.py
diff --git a/examples/jasper/train_keras_jasper.py b/examples/jasper/train.py
similarity index 100%
rename from examples/jasper/train_keras_jasper.py
rename to examples/jasper/train.py
diff --git a/examples/jasper/train_ga_jasper.py b/examples/jasper/train_ga_jasper.py
deleted file mode 100644
index 4697b97e7b..0000000000
--- a/examples/jasper/train_ga_jasper.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Jasper Training")
-
-parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas")
-
-parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.ctc_runners import CTCTrainerGA
-from tensorflow_asr.models.jasper import Jasper
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-ctc_trainer = CTCTrainerGA(text_featurizer, config.learning_config.running_config)
-# Build DS2 model
-with ctc_trainer.strategy.scope():
-    jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    jasper._build(speech_featurizer.shape)
-    jasper.summary(line_length=120)
-# Compile
-ctc_trainer.compile(jasper, config.learning_config.optimizer_config,
-                    max_to_keep=args.max_ckpts)
-
-ctc_trainer.fit(train_dataset, eval_dataset,
-                train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/examples/jasper/train_jasper.py b/examples/jasper/train_jasper.py
deleted file mode 100644
index 528d1eaaa4..0000000000
--- a/examples/jasper/train_jasper.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Jasper Training")
-
-parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.ctc_runners import CTCTrainer
-from tensorflow_asr.models.jasper import Jasper
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-ctc_trainer = CTCTrainer(text_featurizer, config.learning_config.running_config)
-# Build DS2 model
-with ctc_trainer.strategy.scope():
-    jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    jasper._build(speech_featurizer.shape)
-    jasper.summary(line_length=120)
-# Compile
-ctc_trainer.compile(jasper, config.learning_config.optimizer_config,
-                    max_to_keep=args.max_ckpts)
-
-ctc_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
diff --git a/examples/rnn_transducer/test_subword_rnn_transducer.py b/examples/rnn_transducer/test.py
similarity index 100%
rename from examples/rnn_transducer/test_subword_rnn_transducer.py
rename to examples/rnn_transducer/test.py
diff --git a/examples/rnn_transducer/test_rnn_transducer.py b/examples/rnn_transducer/test_rnn_transducer.py
deleted file mode 100644
index b4ed2f9eee..0000000000
--- a/examples/rnn_transducer/test_rnn_transducer.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_devices
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Testing")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
-
-parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
-
-parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-setup_devices([args.device], cpu=args.cpu)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.base_runners import BaseTester
-from tensorflow_asr.models.streaming_transducer import StreamingTransducer
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-tf.random.set_seed(0)
-assert args.saved
-
-if args.tfrecords:
-    test_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.test_dataset_config)
-    )
-else:
-    test_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.test_dataset_config)
-    )
-
-# build model
-streaming_transducer = StreamingTransducer(
-    vocabulary_size=text_featurizer.num_classes,
-    **config.model_config
-)
-streaming_transducer._build(speech_featurizer.shape)
-streaming_transducer.load_weights(args.saved)
-streaming_transducer.summary(line_length=150)
-streaming_transducer.add_featurizers(speech_featurizer, text_featurizer)
-
-streaming_transducer_tester = BaseTester(
-    config=config.learning_config.running_config,
-    output_name=args.output_name
-)
-streaming_transducer_tester.compile(streaming_transducer)
-streaming_transducer_tester.run(test_dataset)
diff --git a/examples/rnn_transducer/tflite_subword_rnn_transducer.py b/examples/rnn_transducer/tflite.py
similarity index 100%
rename from examples/rnn_transducer/tflite_subword_rnn_transducer.py
rename to examples/rnn_transducer/tflite.py
diff --git a/examples/rnn_transducer/tflite_rnn_transducer.py b/examples/rnn_transducer/tflite_rnn_transducer.py
deleted file mode 100644
index 6d4627010c..0000000000
--- a/examples/rnn_transducer/tflite_rnn_transducer.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment
-
-setup_environment()
-import tensorflow as tf
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.models.streaming_transducer import StreamingTransducer
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Testing")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
-
-parser.add_argument("--saved", type=str, default=None,
-                    help="Path to saved model")
-
-parser.add_argument("output", type=str, default=None,
-                    help="TFLite file path to be exported")
-
-args = parser.parse_args()
-
-assert args.saved and args.output
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-# build model
-streaming_transducer = StreamingTransducer(
-    **config.model_config,
-    vocabulary_size=text_featurizer.num_classes
-)
-streaming_transducer._build(speech_featurizer.shape)
-streaming_transducer.load_weights(args.saved)
-streaming_transducer.summary(line_length=150)
-streaming_transducer.add_featurizers(speech_featurizer, text_featurizer)
-
-concrete_func = streaming_transducer.make_tflite_function().get_concrete_function()
-converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
-converter.optimizations = [tf.lite.Optimize.DEFAULT]
-converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
-                                       tf.lite.OpsSet.SELECT_TF_OPS]
-tflite_model = converter.convert()
-
-if not os.path.exists(os.path.dirname(args.output)):
-    os.makedirs(os.path.dirname(args.output))
-with open(args.output, "wb") as tflite_out:
-    tflite_out.write(tflite_model)
diff --git a/examples/rnn_transducer/train_keras_subword_rnn_transducer.py b/examples/rnn_transducer/train.py
similarity index 88%
rename from examples/rnn_transducer/train_keras_subword_rnn_transducer.py
rename to examples/rnn_transducer/train.py
index c9254a4fd0..6f7c92c643 100644
--- a/examples/rnn_transducer/train_keras_subword_rnn_transducer.py
+++ b/examples/rnn_transducer/train.py
@@ -43,9 +43,7 @@
 
 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
 
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords")
+parser.add_argument("--subword", default=False, action="store_true", help="Use subword")
 
 args = parser.parse_args()
 
@@ -56,22 +54,18 @@
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer
 from tensorflow_asr.models.keras.streaming_transducer import StreamingTransducer
 
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
 
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+if args.subword:
+    print("Use subwords ...")
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
+    print("Use characters ...")
+    text_featurizer = CharFeaturizer(config.decoder_config)
 
 if args.tfrecords:
     train_dataset = ASRTFRecordDatasetKeras(
diff --git a/examples/rnn_transducer/train_ga_rnn_transducer.py b/examples/rnn_transducer/train_ga_rnn_transducer.py
deleted file mode 100644
index 516d9d90e9..0000000000
--- a/examples/rnn_transducer/train_ga_rnn_transducer.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA
-from tensorflow_asr.models.streaming_transducer import StreamingTransducer
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-streaming_transducer_trainer = TransducerTrainerGA(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with streaming_transducer_trainer.strategy.scope():
-    # build model
-    streaming_transducer = StreamingTransducer(
-        **config.model_config,
-        vocabulary_size=text_featurizer.num_classes
-    )
-    streaming_transducer._build(speech_featurizer.shape)
-    streaming_transducer.summary(line_length=150)
-
-    optimizer = tf.keras.optimizers.get(config.learning_config.optimizer_config)
-
-streaming_transducer_trainer.compile(model=streaming_transducer, optimizer=optimizer,
-                                     max_to_keep=args.max_ckpts)
-
-streaming_transducer_trainer.fit(train_dataset, eval_dataset,
-                                 train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/examples/rnn_transducer/train_ga_subword_rnn_transducer.py b/examples/rnn_transducer/train_ga_subword_rnn_transducer.py
deleted file mode 100644
index 96b81f4ea1..0000000000
--- a/examples/rnn_transducer/train_ga_subword_rnn_transducer.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--acs", type=int, default=None, help="Train accumulation steps")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--cache", default=False, action="store_true", help="Enable caching for dataset")
-
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainerGA
-from tensorflow_asr.models.streaming_transducer import StreamingTransducer
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
-else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-streaming_transducer_trainer = TransducerTrainerGA(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with streaming_transducer_trainer.strategy.scope():
-    # build model
-    streaming_transducer = StreamingTransducer(
-        **config.model_config,
-        vocabulary_size=text_featurizer.num_classes
-    )
-    streaming_transducer._build(speech_featurizer.shape)
-    streaming_transducer.summary(line_length=150)
-
-    optimizer = tf.keras.optimizers.get(config.learning_config.optimizer_config)
-
-streaming_transducer_trainer.compile(model=streaming_transducer, optimizer=optimizer,
-                                     max_to_keep=args.max_ckpts)
-
-streaming_transducer_trainer.fit(train_dataset, eval_dataset,
-                                 train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/examples/rnn_transducer/train_rnn_transducer.py b/examples/rnn_transducer/train_rnn_transducer.py
deleted file mode 100644
index 978c613836..0000000000
--- a/examples/rnn_transducer/train_rnn_transducer.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainer
-from tensorflow_asr.models.streaming_transducer import StreamingTransducer
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-streaming_transducer_trainer = TransducerTrainer(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with streaming_transducer_trainer.strategy.scope():
-    # build model
-    streaming_transducer = StreamingTransducer(
-        **config.model_config,
-        vocabulary_size=text_featurizer.num_classes
-    )
-    streaming_transducer._build(speech_featurizer.shape)
-    streaming_transducer.summary(line_length=150)
-
-    optimizer = tf.keras.optimizers.get(config.learning_config.optimizer_config)
-
-streaming_transducer_trainer.compile(model=streaming_transducer, optimizer=optimizer,
-                                     max_to_keep=args.max_ckpts)
-
-streaming_transducer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)
diff --git a/examples/rnn_transducer/train_subword_rnn_transducer.py b/examples/rnn_transducer/train_subword_rnn_transducer.py
deleted file mode 100644
index 14c937349b..0000000000
--- a/examples/rnn_transducer/train_subword_rnn_transducer.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
-
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
-
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
-
-parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_strategy(args.devices)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
-from tensorflow_asr.runners.transducer_runners import TransducerTrainer
-from tensorflow_asr.models.streaming_transducer import StreamingTransducer
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
-else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
-
-if args.tfrecords:
-    train_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-else:
-    train_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config)
-    )
-    eval_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
-    )
-
-streaming_transducer_trainer = TransducerTrainer(
-    config=config.learning_config.running_config,
-    text_featurizer=text_featurizer, strategy=strategy
-)
-
-with streaming_transducer_trainer.strategy.scope():
-    # build model
-    streaming_transducer = StreamingTransducer(
-        **config.model_config,
-        vocabulary_size=text_featurizer.num_classes
-    )
-    streaming_transducer._build(speech_featurizer.shape)
-    streaming_transducer.summary(line_length=150)
-
-    optimizer = tf.keras.optimizers.get(config.learning_config.optimizer_config)
-
-streaming_transducer_trainer.compile(model=streaming_transducer, optimizer=optimizer,
-                                     max_to_keep=args.max_ckpts)
-
-streaming_transducer_trainer.fit(train_dataset, eval_dataset, train_bs=args.tbs, eval_bs=args.ebs)

From d86d621b70a1b6eb2576ca88349105ece6cfcbf5 Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Thu, 15 Apr 2021 00:30:07 +0700
Subject: [PATCH 07/13] :writing_hand: update conformer training script

---
 examples/conformer/config.yml                 |  32 ++--
 examples/conformer/tflite.py                  |  30 ++--
 examples/conformer/train.py                   |  88 ++++++-----
 examples/conformer/train_tpu.py               | 147 ------------------
 scripts/create_librispeech_trans.py           |   4 +-
 scripts/create_tfrecords.py                   |   4 +-
 scripts/generate_metadata.py                  |   2 +-
 tensorflow_asr/configs/__init__.py            |  33 ----
 tensorflow_asr/configs/config.py              |  13 +-
 tensorflow_asr/datasets/__init__.py           |  17 --
 tensorflow_asr/datasets/asr_dataset.py        |  57 +++----
 tensorflow_asr/datasets/base_dataset.py       |   2 +-
 .../featurizers/methods/gammatone.py          |   2 +-
 .../models/layers/positional_encoding.py      |   2 +-
 tensorflow_asr/models/layers/subsampling.py   |   8 +-
 tensorflow_asr/utils/file_util.py             |  59 ++++---
 tensorflow_asr/utils/math_util.py             |   8 +-
 17 files changed, 161 insertions(+), 347 deletions(-)
 delete mode 100644 examples/conformer/train_tpu.py

diff --git a/examples/conformer/config.yml b/examples/conformer/config.yml
index 79bef5276b..0ee6487e98 100755
--- a/examples/conformer/config.yml
+++ b/examples/conformer/config.yml
@@ -24,14 +24,14 @@ speech_config:
   normalize_per_feature: False
 
 decoder_config:
-  vocabulary: null
+  vocabulary: ./vocabularies/librispeech/librispeech_train_10_1008.subwords
   target_vocab_size: 1000
   max_subword_length: 10
   blank_at_zero: True
-  beam_width: 5
+  beam_width: 0
   norm_score: True
   corpus_files:
-    - /media/nlhuy/Data/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
+    - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
 
 model_config:
   name: conformer
@@ -40,7 +40,7 @@ model_config:
     filters: 144
     kernel_size: 3
     strides: 2
-  encoder_positional_encoding: sinusoid_concat_v2
+  encoder_positional_encoding: sinusoid_concat
   encoder_dmodel: 144
   encoder_num_blocks: 16
   encoder_head_size: 36
@@ -75,11 +75,10 @@ learning_config:
           num_masks: 1
           mask_factor: 27
     data_paths:
-      - /mnt/Data/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+      - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
+    tfrecords_dir: null
     shuffle: True
     cache: True
-    cache_percent: 0.2
     buffer_size: 100
     drop_remainder: True
     stage: train
@@ -87,7 +86,7 @@ learning_config:
   eval_dataset_config:
     use_tf: True
     data_paths: null
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -97,7 +96,7 @@ learning_config:
   test_dataset_config:
     use_tf: True
     data_paths: null
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -106,26 +105,21 @@ learning_config:
 
   optimizer_config:
     warmup_steps: 40000
-    beta1: 0.9
-    beta2: 0.98
+    beta_1: 0.9
+    beta_2: 0.98
     epsilon: 1e-9
 
   running_config:
     batch_size: 2
-    accumulation_steps: 4
     num_epochs: 50
-    outdir: /mnt/Miscellanea/Models/local/conformer
-    log_interval_steps: 300
-    eval_interval_steps: 500
-    save_interval_steps: 1000
     checkpoint:
-      filepath: /mnt/Miscellanea/Models/local/conformer/checkpoints/{epoch:02d}.h5
+      filepath: /mnt/e/Models/local/conformer/checkpoints/{epoch:02d}.h5
       save_best_only: True
       save_weights_only: False
       save_freq: epoch
-    states_dir: /mnt/Miscellanea/Models/local/conformer/states
+    states_dir: /mnt/e/Models/local/conformer/states
     tensorboard:
-      log_dir: /mnt/Miscellanea/Models/local/conformer/tensorboard
+      log_dir: /mnt/e/Models/local/conformer/tensorboard
       histogram_freq: 1
       write_graph: True
       write_images: True
diff --git a/examples/conformer/tflite.py b/examples/conformer/tflite.py
index 29794d957e..3159f656ba 100644
--- a/examples/conformer/tflite.py
+++ b/examples/conformer/tflite.py
@@ -14,14 +14,14 @@
 
 import os
 import argparse
-from tensorflow_asr.utils import setup_environment
+from tensorflow_asr.utils import env_util, file_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer
 from tensorflow_asr.models.conformer import Conformer
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
@@ -30,17 +30,13 @@
 
 parser = argparse.ArgumentParser(prog="Conformer Testing")
 
-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
+parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--saved", type=str, default=None,
-                    help="Path to saved model")
+parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
 
-parser.add_argument("--subwords", type=str, default=None,
-                    help="Path to file that stores generated subwords")
+parser.add_argument("--subwords", type=str, default=None, help="Use subwords")
 
-parser.add_argument("output", type=str, default=None,
-                    help="TFLite file path to be exported")
+parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported")
 
 args = parser.parse_args()
 
@@ -49,17 +45,16 @@
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
 
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+if args.subwords:
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
-    raise ValueError("subwords must be set")
+    text_featurizer = CharFeaturizer(config.decoder_config)
 
 # build model
 conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
 conformer._build(speech_featurizer.shape)
 conformer.load_weights(args.saved)
-conformer.summary(line_length=150)
+conformer.summary(line_length=100)
 conformer.add_featurizers(speech_featurizer, text_featurizer)
 
 concrete_func = conformer.make_tflite_function().get_concrete_function()
@@ -69,7 +64,6 @@
 converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
 tflite_model = converter.convert()
 
-if not os.path.exists(os.path.dirname(args.output)):
-    os.makedirs(os.path.dirname(args.output))
+args.output = file_util.preprocess_paths(args.output)
 with open(args.output, "wb") as tflite_out:
     tflite_out.write(tflite_model)
diff --git a/examples/conformer/train.py b/examples/conformer/train.py
index 0c844062a1..3b10b3c86e 100644
--- a/examples/conformer/train.py
+++ b/examples/conformer/train.py
@@ -15,9 +15,9 @@
 import os
 import math
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
+from tensorflow_asr.utils import env_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
@@ -28,81 +28,86 @@
 
 parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
 parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
 
 parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
 
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
+
 parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
 
 parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
 
 parser.add_argument("--spx", type=int, default=1, help="Steps per execution for maximizing performance")
 
-parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata")
+parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata")
+
+parser.add_argument("--static_length", default=False, action="store_true", help="Use static lengths")
 
 parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
 
 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
 
-parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
-
 args = parser.parse_args()
 
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
 
-strategy = setup_strategy(args.devices)
+strategy = env_util.setup_strategy(args.devices)
 
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
-from tensorflow_asr.models.keras.conformer import Conformer
+from tensorflow_asr.datasets import asr_dataset
+from tensorflow_asr.featurizers import speech_featurizers, text_featurizers
+from tensorflow_asr.models.transducer.conformer import Conformer
 from tensorflow_asr.optimizers.schedules import TransformerSchedule
 
 config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
+speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)
 
 if args.sentence_piece:
     print("Loading SentencePiece model ...")
-    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
+    text_featurizer = text_featurizers.SentencePieceFeaturizer(config.decoder_config)
 elif args.subwords:
     print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer(config.decoder_config)
+    text_featurizer = text_featurizers.SubwordFeaturizer(config.decoder_config)
 else:
     print("Use characters ...")
-    text_featurizer = CharFeaturizer(config.decoder_config)
+    text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)
 
 if args.tfrecords:
-    train_dataset = ASRTFRecordDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    train_dataset = asr_dataset.ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.train_dataset_config),
         indefinite=True
     )
-    eval_dataset = ASRTFRecordDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
+    eval_dataset = asr_dataset.ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        **vars(config.learning_config.eval_dataset_config),
+        indefinite=True
     )
-    # Update metadata calculated from both train and eval datasets
-    train_dataset.load_metadata(args.metadata_prefix)
-    eval_dataset.load_metadata(args.metadata_prefix)
-    # Use dynamic length
-    speech_featurizer.reset_length()
-    text_featurizer.reset_length()
 else:
-    train_dataset = ASRSliceDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    train_dataset = asr_dataset.ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.train_dataset_config),
         indefinite=True
     )
-    eval_dataset = ASRSliceDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.train_dataset_config),
+    eval_dataset = asr_dataset.ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        **vars(config.learning_config.eval_dataset_config),
         indefinite=True
     )
 
-global_batch_size = config.learning_config.running_config.batch_size
+train_dataset.load_metadata(args.metadata)
+eval_dataset.load_metadata(args.metadata)
+
+if not args.static_length:
+    speech_featurizer.reset_length()
+    text_featurizer.reset_length()
+
+global_batch_size = args.tbs or config.learning_config.running_config.batch_size
 global_batch_size *= strategy.num_replicas_in_sync
 
 train_data_loader = train_dataset.create(global_batch_size)
@@ -112,17 +117,15 @@
     # build model
     conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
     conformer._build(speech_featurizer.shape)
-    conformer.summary(line_length=120)
+    conformer.summary(line_length=100)
 
     optimizer = tf.keras.optimizers.Adam(
         TransformerSchedule(
             d_model=conformer.dmodel,
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
+            warmup_steps=config.learning_config.optimizer_config.pop("warmup_steps", 10000),
             max_lr=(0.05 / math.sqrt(conformer.dmodel))
         ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
+        **config.learning_config.optimizer_config
     )
 
     conformer.compile(
@@ -139,7 +142,10 @@
 ]
 
 conformer.fit(
-    train_data_loader, epochs=config.learning_config.running_config.num_epochs,
-    validation_data=eval_data_loader, callbacks=callbacks,
-    steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps
+    train_data_loader,
+    epochs=config.learning_config.running_config.num_epochs,
+    validation_data=eval_data_loader,
+    callbacks=callbacks,
+    steps_per_epoch=train_dataset.total_steps,
+    validation_steps=eval_dataset.total_steps
 )
diff --git a/examples/conformer/train_tpu.py b/examples/conformer/train_tpu.py
deleted file mode 100644
index 8a0937c985..0000000000
--- a/examples/conformer/train_tpu.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_tpu
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
-
-parser.add_argument("--bs", type=int, default=None, help="Batch size per replica")
-
-parser.add_argument("--spx", type=int, default=50, help="Steps per execution for maximizing TPU performance")
-
-parser.add_argument("--tpu_address", type=str, default=None, help="TPU address. Leave None on Colab")
-
-parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata")
-
-parser.add_argument("--compute_lengths", default=False, action="store_true", help="Whether to compute lengths")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
-
-parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
-
-parser.add_argument("--validation", default=False, action="store_true", help="Enable validation dataset")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_tpu(args.tpu_address)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
-from tensorflow_asr.models.keras.conformer import Conformer
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-
-if args.sentence_piece:
-    print("Loading SentencePiece model ...")
-    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
-elif args.subwords:
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer(config.decoder_config)
-else:
-    print("Use characters...")
-    text_featurizer = CharFeaturizer(config.decoder_config)
-
-train_dataset = ASRTFRecordDatasetKeras(
-    speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-    **vars(config.learning_config.train_dataset_config),
-    indefinite=True
-)
-
-if args.validation:
-    eval_dataset = ASRTFRecordDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config),
-        indefinite=True
-    )
-
-if args.compute_lengths:
-    train_dataset.update_lengths(args.metadata_prefix)
-    if args.validation:
-        eval_dataset.update_lengths(args.metadata_prefix)
-
-# Update metadata calculated from both train and eval datasets
-train_dataset.load_metadata(args.metadata_prefix)
-if args.validation:
-    eval_dataset.load_metadata(args.metadata_prefix)
-
-batch_size = args.bs if args.bs is not None else config.learning_config.running_config.batch_size
-global_batch_size = batch_size
-global_batch_size *= strategy.num_replicas_in_sync
-
-train_data_loader = train_dataset.create(global_batch_size)
-eval_data_loader = eval_dataset.create(global_batch_size) if args.validation else None
-validation_steps = eval_dataset.total_steps if args.validation else None
-
-with strategy.scope():
-    # build model
-    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    conformer._build(speech_featurizer.shape, prediction_shape=text_featurizer.prepand_shape, batch_size=global_batch_size)
-
-    if args.saved:
-        conformer.load_weights(args.saved, by_name=True, skip_mismatch=True)
-        print('Load pretrained weights successfully')
-
-    conformer.summary(line_length=120)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=conformer.dmodel,
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(conformer.dmodel))
-        ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
-    )
-
-    conformer.compile(
-        optimizer=optimizer,
-        experimental_steps_per_execution=args.spx,
-        global_batch_size=global_batch_size,
-        blank=text_featurizer.blank
-    )
-
-callbacks = [
-    tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint),
-    tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir),
-    tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard)
-]
-
-conformer.fit(
-    train_data_loader, epochs=config.learning_config.running_config.num_epochs,
-    validation_data=eval_data_loader, callbacks=callbacks,
-    steps_per_epoch=train_dataset.total_steps, validation_steps=validation_steps
-)
diff --git a/scripts/create_librispeech_trans.py b/scripts/create_librispeech_trans.py
index 9a84cb4039..3ad3e7ac5e 100644
--- a/scripts/create_librispeech_trans.py
+++ b/scripts/create_librispeech_trans.py
@@ -19,7 +19,7 @@
 from tqdm.auto import tqdm
 import unicodedata
 
-from tensorflow_asr.utils.utils import preprocess_paths
+from tensorflow_asr.utils.file_util import preprocess_paths
 
 parser = argparse.ArgumentParser(prog="Setup LibriSpeech Transcripts")
 
@@ -31,7 +31,7 @@
 
 assert args.dir and args.output
 
-args.dir = preprocess_paths(args.dir)
+args.dir = preprocess_paths(args.dir, isdir=True)
 args.output = preprocess_paths(args.output)
 
 transcripts = []
diff --git a/scripts/create_tfrecords.py b/scripts/create_tfrecords.py
index 8fe48dcd0e..32a3d520bd 100644
--- a/scripts/create_tfrecords.py
+++ b/scripts/create_tfrecords.py
@@ -15,7 +15,7 @@
 import os
 import argparse
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.utils.utils import preprocess_paths
+from tensorflow_asr.utils.file_util import preprocess_paths
 from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset
 from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
 
@@ -40,7 +40,7 @@
 args = parser.parse_args()
 
 transcripts = preprocess_paths(args.transcripts)
-tfrecords_dir = preprocess_paths(args.tfrecords_dir)
+tfrecords_dir = preprocess_paths(args.tfrecords_dir, isdir=True)
 
 config = Config(args.config)
 
diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py
index 395e41effb..48b0315943 100644
--- a/scripts/generate_metadata.py
+++ b/scripts/generate_metadata.py
@@ -15,7 +15,7 @@
 import os
 import argparse
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.utils.utils import preprocess_paths
+from tensorflow_asr.utils.file_util import preprocess_paths
 from tensorflow_asr.datasets.asr_dataset import ASRDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
diff --git a/tensorflow_asr/configs/__init__.py b/tensorflow_asr/configs/__init__.py
index f4d5510355..e69de29bb2 100644
--- a/tensorflow_asr/configs/__init__.py
+++ b/tensorflow_asr/configs/__init__.py
@@ -1,33 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-import yaml
-
-
-def load_yaml(path):
-    # Fix yaml numbers https://stackoverflow.com/a/30462009/11037553
-    loader = yaml.SafeLoader
-    loader.add_implicit_resolver(
-        u'tag:yaml.org,2002:float',
-        re.compile(u'''^(?:
-         [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)?
-        |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
-        |\\.[0-9_]+(?:[eE][-+][0-9]+)?
-        |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*
-        |[-+]?\\.(?:inf|Inf|INF)
-        |\\.(?:nan|NaN|NAN))$''', re.X),
-        list(u'-+0123456789.'))
-    with open(path, "r", encoding="utf-8") as file:
-        return yaml.load(file, Loader=loader)
diff --git a/tensorflow_asr/configs/config.py b/tensorflow_asr/configs/config.py
index da79ddd1f0..028016e853 100644
--- a/tensorflow_asr/configs/config.py
+++ b/tensorflow_asr/configs/config.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from . import load_yaml
-from ..augmentations.augments import Augmentation
+from ..augmentations.augmentation import Augmentation
 from ..utils import file_util
 
 
@@ -42,14 +41,14 @@ def __init__(self, config: dict = None):
         if not config: config = {}
         self.stage = config.pop("stage", None)
         self.data_paths = file_util.preprocess_paths(config.pop("data_paths", None))
-        self.tfrecords_dir = file_util.preprocess_paths(config.pop("tfrecords_dir", None))
+        self.tfrecords_dir = file_util.preprocess_paths(config.pop("tfrecords_dir", None), isdir=True)
         self.tfrecords_shards = config.pop("tfrecords_shards", 16)
         self.shuffle = config.pop("shuffle", False)
         self.cache = config.pop("cache", False)
         self.drop_remainder = config.pop("drop_remainder", True)
         self.buffer_size = config.pop("buffer_size", 100)
         self.use_tf = config.pop("use_tf", False)
-        self.augmentations = Augmentation(config.pop("augmentation_config", {}), use_tf=self.use_tf)
+        self.augmentations = Augmentation(config.pop("augmentation_config", {}))
         for k, v in config.items(): setattr(self, k, v)
 
 
@@ -59,10 +58,6 @@ def __init__(self, config: dict = None):
         self.batch_size = config.pop("batch_size", 1)
         self.accumulation_steps = config.pop("accumulation_steps", 1)
         self.num_epochs = config.pop("num_epochs", 20)
-        self.outdir = file_util.preprocess_paths(config.pop("outdir", None))
-        self.log_interval_steps = config.pop("log_interval_steps", 500)
-        self.save_interval_steps = config.pop("save_interval_steps", 500)
-        self.eval_interval_steps = config.pop("eval_interval_steps", 1000)
         for k, v in config.items(): setattr(self, k, v)
 
 
@@ -81,7 +76,7 @@ class Config:
     """ User config class for training, testing or infering """
 
     def __init__(self, path: str):
-        config = load_yaml(file_util.preprocess_paths(path))
+        config = file_util.load_yaml(file_util.preprocess_paths(path))
         self.speech_config = config.pop("speech_config", {})
         self.decoder_config = config.pop("decoder_config", {})
         self.model_config = config.pop("model_config", {})
diff --git a/tensorflow_asr/datasets/__init__.py b/tensorflow_asr/datasets/__init__.py
index f5f8a8a1e8..e69de29bb2 100644
--- a/tensorflow_asr/datasets/__init__.py
+++ b/tensorflow_asr/datasets/__init__.py
@@ -1,17 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .base_dataset import BaseDataset
-from .asr_dataset import ASRDataset, ASRTFRecordDataset, ASRSliceDataset
-__all__ = ['BaseDataset', 'ASRDataset', 'ASRTFRecordDataset', 'ASRSliceDataset']
diff --git a/tensorflow_asr/datasets/asr_dataset.py b/tensorflow_asr/datasets/asr_dataset.py
index f2d08de6ae..1b6fdca3b6 100755
--- a/tensorflow_asr/datasets/asr_dataset.py
+++ b/tensorflow_asr/datasets/asr_dataset.py
@@ -60,12 +60,15 @@ def compute_metadata(self):
             self.speech_featurizer.update_length(input_length)
             self.text_featurizer.update_length(label_length)
 
-    def save_metadata(self, metadata_prefix: str = None):
-        if metadata_prefix is None: return
-        metadata_path = file_util.preprocess_paths(metadata_prefix) + ".metadata.json"
-        if tf.io.gfile.exists(metadata_path):
-            with tf.io.gfile.GFile(metadata_path, "r") as f:
-                content = json.loads(f.read())
+    def save_metadata(self, metadata: str = None):
+        if metadata is None: return
+        metadata = file_util.preprocess_paths(metadata)
+        if tf.io.gfile.exists(metadata):
+            with tf.io.gfile.GFile(metadata, "r") as f:
+                try:
+                    content = json.loads(f.read())
+                except json.JSONDecodeError:
+                    raise ValueError(f'File {metadata} is currently not in json format. Please update the file')
         else:
             content = {}
         content[self.stage] = {
@@ -73,17 +76,20 @@ def save_metadata(self, metadata_prefix: str = None):
             "max_label_length": self.text_featurizer.max_length,
             "num_entries": self.total_steps
         }
-        with tf.io.gfile.GFile(metadata_path, "w") as f:
+        with tf.io.gfile.GFile(metadata, "w") as f:
             f.write(json.dumps(content, indent=2))
-        print(f"metadata written to {metadata_path}")
-
-    def load_metadata(self, metadata_prefix: str = None):
-        if metadata_prefix is None: return
-        metadata_path = file_util.preprocess_paths(metadata_prefix) + ".metadata.json"
-        if tf.io.gfile.exists(metadata_path):
-            print(f"Loading metadata from {metadata_path} ...")
-            with tf.io.gfile.GFile(metadata_path, "r") as f:
-                content = json.loads(f.read()).get(self.stage, {})
+        print(f"Metadata written to {metadata}")
+
+    def load_metadata(self, metadata: str = None):
+        if metadata is None: return
+        metadata = file_util.preprocess_paths(metadata)
+        if tf.io.gfile.exists(metadata):
+            print(f"Loading metadata from {metadata} ...")
+            with tf.io.gfile.GFile(metadata, "r") as f:
+                try:
+                    content = json.loads(f.read()).get(self.stage, {})
+                except json.JSONDecodeError:
+                    raise ValueError(f'File {metadata} must be in json format')
                 self.speech_featurizer.update_length(int(content.get("max_input_length", 0)))
                 self.text_featurizer.update_length(int(content.get("max_label_length", 0)))
                 self.total_steps = int(content.get("num_entries", 0))
@@ -123,19 +129,17 @@ def preprocess(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor):
         with tf.device("/CPU:0"):
             def fn(_path: bytes, _audio: bytes, _indices: bytes):
                 signal = read_raw_audio(_audio, sample_rate=self.speech_featurizer.sample_rate)
-
                 signal = self.augmentations.signal_augment(signal)
-
                 features = self.speech_featurizer.extract(signal.numpy())
-
                 features = self.augmentations.feature_augment(features)
+                features = tf.convert_to_tensor(features, tf.float32)
+                input_length = tf.cast(tf.shape(features)[0], tf.int32)
 
                 label = tf.strings.to_number(tf.strings.split(_indices), out_type=tf.int32)
                 label_length = tf.cast(tf.shape(label)[0], tf.int32)
+
                 prediction = self.text_featurizer.prepand_blank(label)
                 prediction_length = tf.cast(tf.shape(prediction)[0], tf.int32)
-                features = tf.convert_to_tensor(features, tf.float32)
-                input_length = tf.cast(tf.shape(features)[0], tf.int32)
 
                 return _path, features, input_length, label, label_length, prediction, prediction_length
 
@@ -147,19 +151,16 @@ def fn(_path: bytes, _audio: bytes, _indices: bytes):
     def tf_preprocess(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor):
         with tf.device("/CPU:0"):
             signal = tf_read_raw_audio(audio, self.speech_featurizer.sample_rate)
-
             signal = self.augmentations.signal_augment(signal)
-
             features = self.speech_featurizer.tf_extract(signal)
-
             features = self.augmentations.feature_augment(features)
+            input_length = tf.cast(tf.shape(features)[0], tf.int32)
 
             label = tf.strings.to_number(tf.strings.split(indices), out_type=tf.int32)
             label_length = tf.cast(tf.shape(label)[0], tf.int32)
+
             prediction = self.text_featurizer.prepand_blank(label)
             prediction_length = tf.cast(tf.shape(prediction)[0], tf.int32)
-            features = tf.convert_to_tensor(features, tf.float32)
-            input_length = tf.cast(tf.shape(features)[0], tf.int32)
 
             return path, features, input_length, label, label_length, prediction, prediction_length
 
@@ -190,6 +191,7 @@ def parse(self, path: tf.Tensor, audio: tf.Tensor, indices: tf.Tensor):
 
     def process(self, dataset, batch_size):
         dataset = dataset.map(self.parse, num_parallel_calls=AUTOTUNE)
+        self.total_steps = math_util.get_num_batches(self.total_steps, batch_size, drop_remainders=self.drop_remainder)
 
         if self.cache:
             dataset = dataset.cache()
@@ -197,7 +199,7 @@ def process(self, dataset, batch_size):
         if self.shuffle:
             dataset = dataset.shuffle(self.buffer_size, reshuffle_each_iteration=True)
 
-        if self.indefinite:
+        if self.indefinite and self.total_steps:
             dataset = dataset.repeat()
 
         # PADDED BATCH the dataset
@@ -232,7 +234,6 @@ def process(self, dataset, batch_size):
 
         # PREFETCH to improve speed of input length
         dataset = dataset.prefetch(AUTOTUNE)
-        self.total_steps = math_util.get_num_batches(self.total_steps, batch_size, drop_remainders=self.drop_remainder)
         return dataset
 
     def create(self, batch_size: int):
diff --git a/tensorflow_asr/datasets/base_dataset.py b/tensorflow_asr/datasets/base_dataset.py
index 9444d3b85e..3722752b79 100644
--- a/tensorflow_asr/datasets/base_dataset.py
+++ b/tensorflow_asr/datasets/base_dataset.py
@@ -15,7 +15,7 @@
 
 import tensorflow as tf
 
-from ..augmentations.augments import Augmentation
+from ..augmentations.augmentation import Augmentation
 
 BUFFER_SIZE = 100
 TFRECORD_SHARDS = 16
diff --git a/tensorflow_asr/featurizers/methods/gammatone.py b/tensorflow_asr/featurizers/methods/gammatone.py
index dec76d8482..34443efcb7 100644
--- a/tensorflow_asr/featurizers/methods/gammatone.py
+++ b/tensorflow_asr/featurizers/methods/gammatone.py
@@ -16,7 +16,7 @@
 import numpy as np
 import tensorflow as tf
 
-from ..utils.utils import shape_list
+from ...utils.shape_util import shape_list
 
 pi = tf.constant(np.pi, dtype=tf.complex64)
 
diff --git a/tensorflow_asr/models/layers/positional_encoding.py b/tensorflow_asr/models/layers/positional_encoding.py
index 832eb2c491..bf108aa263 100755
--- a/tensorflow_asr/models/layers/positional_encoding.py
+++ b/tensorflow_asr/models/layers/positional_encoding.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import tensorflow as tf
-from ...utils.utils import shape_list
+from ...utils.shape_util import shape_list
 
 
 class PositionalEncoding(tf.keras.layers.Layer):
diff --git a/tensorflow_asr/models/layers/subsampling.py b/tensorflow_asr/models/layers/subsampling.py
index 8a84f35205..3e69f4dcdf 100644
--- a/tensorflow_asr/models/layers/subsampling.py
+++ b/tensorflow_asr/models/layers/subsampling.py
@@ -14,7 +14,7 @@
 
 import tensorflow as tf
 
-from ...utils.utils import merge_two_last_dims, shape_list
+from ...utils import shape_util, math_util
 
 
 class TimeReduction(tf.keras.layers.Layer):
@@ -27,7 +27,7 @@ def padding(self, time):
         return tf.cast(new_time, dtype=tf.int32) - time
 
     def call(self, inputs, **kwargs):
-        shape = shape_list(inputs)
+        shape = shape_util.shape_list(inputs)
         outputs = tf.pad(inputs, [[0, 0], [0, self.padding(shape[1])], [0, 0]])
         outputs = tf.reshape(outputs, [shape[0], -1, shape[-1] * self.time_reduction_factor])
         return outputs
@@ -95,7 +95,7 @@ def call(self, inputs, training=False, **kwargs):
         outputs = tf.nn.relu(outputs)
         outputs = self.maxpool2(outputs, training=training)
 
-        return merge_two_last_dims(outputs)
+        return math_util.merge_two_last_dims(outputs)
 
     def get_config(self):
         conf = super(VggSubsampling, self).get_config()
@@ -137,7 +137,7 @@ def call(self, inputs, training=False, **kwargs):
         outputs = tf.nn.relu(outputs)
         outputs = self.conv2(outputs, training=training)
         outputs = tf.nn.relu(outputs)
-        return merge_two_last_dims(outputs)
+        return math_util.merge_two_last_dims(outputs)
 
     def get_config(self):
         conf = super(Conv2dSubsampling, self).get_config()
diff --git a/tensorflow_asr/utils/file_util.py b/tensorflow_asr/utils/file_util.py
index 0d69315c87..c46363d1ac 100644
--- a/tensorflow_asr/utils/file_util.py
+++ b/tensorflow_asr/utils/file_util.py
@@ -14,16 +14,35 @@
 
 import os
 import re
+import yaml
 import tempfile
+import contextlib
 from typing import Union, List
 import tensorflow as tf
 
 
-def is_hdf5_filepath(filepath):
+def load_yaml(path):
+    # Fix yaml numbers https://stackoverflow.com/a/30462009/11037553
+    loader = yaml.SafeLoader
+    loader.add_implicit_resolver(
+        u'tag:yaml.org,2002:float',
+        re.compile(u'''^(?:
+         [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)?
+        |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
+        |\\.[0-9_]+(?:[eE][-+][0-9]+)?
+        |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*
+        |[-+]?\\.(?:inf|Inf|INF)
+        |\\.(?:nan|NaN|NAN))$''', re.X),
+        list(u'-+0123456789.'))
+    with open(path, "r", encoding="utf-8") as file:
+        return yaml.load(file, Loader=loader)
+
+
+def is_hdf5_filepath(filepath: str) -> bool:
     return (filepath.endswith('.h5') or filepath.endswith('.keras') or filepath.endswith('.hdf5'))
 
 
-def is_cloud_path(path):
+def is_cloud_path(path: str) -> bool:
     """ Check if the path is on cloud (which requires tf.io.gfile)
 
     Args:
@@ -35,8 +54,8 @@ def is_cloud_path(path):
     return bool(re.match(r"^[a-z]+://", path))
 
 
-def preprocess_paths(paths: Union[List, str]):
-    """Expand the path to the root "/"
+def preprocess_paths(paths: Union[List[str], str], isdir: bool = False) -> Union[List[str], str]:
+    """ Expand the path to the root "/" and makedirs
 
     Args:
         paths (Union[List, str]): A path or list of paths
@@ -45,20 +64,21 @@ def preprocess_paths(paths: Union[List, str]):
         Union[List, str]: A processed path or list of paths, return None if it's not path
     """
     if isinstance(paths, list):
-        return [path if is_cloud_path(path) else os.path.abspath(os.path.expanduser(path)) for path in paths]
-    elif isinstance(paths, str):
-        return paths if is_cloud_path(paths) else os.path.abspath(os.path.expanduser(paths))
-    else:
-        return None
-
-
-def read_bytes(path: str) -> tf.Tensor:
-    with tf.io.gfile.GFile(path, "rb") as f:
-        content = f.read()
-    return tf.convert_to_tensor(content, dtype=tf.string)
-
-
-def save_file(filepath):
+        paths = [path if is_cloud_path(path) else os.path.abspath(os.path.expanduser(path)) for path in paths]
+        for path in paths:
+            dirpath = path if isdir else os.path.dirname(path)
+            if not tf.io.gfile.exists(dirpath): tf.io.gfile.makedirs(dirpath)
+        return paths
+    if isinstance(paths, str):
+        paths = paths if is_cloud_path(paths) else os.path.abspath(os.path.expanduser(paths))
+        dirpath = paths if isdir else os.path.dirname(paths)
+        if not tf.io.gfile.exists(dirpath): tf.io.gfile.makedirs(dirpath)
+        return paths
+    return None
+
+
+@contextlib.contextmanager
+def save_file(filepath: str):
     if is_cloud_path(filepath) and is_hdf5_filepath(filepath):
         _, ext = os.path.splitext(filepath)
         with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
@@ -68,7 +88,8 @@ def save_file(filepath):
         yield filepath
 
 
-def read_file(filepath):
+@contextlib.contextmanager
+def read_file(filepath: str):
     if is_cloud_path(filepath) and is_hdf5_filepath(filepath):
         _, ext = os.path.splitext(filepath)
         with tempfile.NamedTemporaryFile(suffix=ext) as tmp:
diff --git a/tensorflow_asr/utils/math_util.py b/tensorflow_asr/utils/math_util.py
index 451a9bcb03..5f613b6e8b 100644
--- a/tensorflow_asr/utils/math_util.py
+++ b/tensorflow_asr/utils/math_util.py
@@ -25,10 +25,10 @@ def log10(x):
     return numerator / denominator
 
 
-def get_num_batches(samples, batch_size, drop_remainders=True):
-    if samples is None or batch_size is None: return None
-    if drop_remainders: return math.floor(float(samples) / float(batch_size))
-    return math.ceil(float(samples) / float(batch_size))
+def get_num_batches(nsamples, batch_size, drop_remainders=True):
+    if nsamples is None or batch_size is None: return None
+    if drop_remainders: return math.floor(float(nsamples) / float(batch_size))
+    return math.ceil(float(nsamples) / float(batch_size))
 
 
 def nan_to_zero(input_tensor):

From 4d07e9c48ea04dab0ed02229135da313cfecaf01 Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Fri, 16 Apr 2021 01:24:23 +0700
Subject: [PATCH 08/13] :writing_hand: update testing script

---
 examples/conformer/test.py                    | 62 +++++++++++--------
 tensorflow_asr/losses/ctc_loss.py             | 10 ++-
 tensorflow_asr/losses/rnnt_loss.py            | 10 ++-
 tensorflow_asr/models/base_model.py           |  7 ++-
 tensorflow_asr/models/ctc/ctc.py              | 15 +++--
 .../models/transducer/contextnet.py           | 34 +++-------
 .../models/transducer/rnn_transducer.py       | 30 ++++-----
 .../models/transducer/transducer.py           | 33 ++++------
 tensorflow_asr/utils/env_util.py              |  3 +-
 9 files changed, 91 insertions(+), 113 deletions(-)

diff --git a/examples/conformer/test.py b/examples/conformer/test.py
index da47ab3dc4..4c1f8e13d1 100644
--- a/examples/conformer/test.py
+++ b/examples/conformer/test.py
@@ -14,9 +14,9 @@
 
 import os
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_devices
+from tensorflow_asr.utils import env_util, file_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
@@ -33,52 +33,57 @@
 
 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
 
+parser.add_argument("--bs", type=int, default=None, help="Test batch size")
+
 parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
 
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
+
 parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
 
 parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
 
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")
+parser.add_argument("--output", type=str, default="test.tsv", help="Result filepath")
 
 args = parser.parse_args()
 
+assert args.saved
+
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
 
-setup_devices([args.device], cpu=args.cpu)
+env_util.setup_devices([args.device], cpu=args.cpu)
 
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
-from tensorflow_asr.runners.base_runners import BaseTester
-from tensorflow_asr.models.conformer import Conformer
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
+from tensorflow_asr.models.transducer.conformer import Conformer
 
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
 
 if args.sentence_piece:
-    print("Loading SentencePiece model ...")
-    text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords)
-elif args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+    print("Use SentencePiece ...")
+    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
+    print("Use subwords ...")
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
-    raise ValueError("subwords must be set")
+    print("Use characters ...")
+    text_featurizer = CharFeaturizer(config.decoder_config)
 
 tf.random.set_seed(0)
-assert args.saved
 
 if args.tfrecords:
     test_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )
 else:
     test_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )
 
@@ -86,12 +91,19 @@
 conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
 conformer._build(speech_featurizer.shape)
 conformer.load_weights(args.saved)
-conformer.summary(line_length=120)
+conformer.summary(line_length=100)
 conformer.add_featurizers(speech_featurizer, text_featurizer)
 
-conformer_tester = BaseTester(
-    config=config.learning_config.running_config,
-    output_name=args.output_name
-)
-conformer_tester.compile(conformer)
-conformer_tester.run(test_dataset)
+batch_size = args.bs or config.learning_config.running_config.batch_size
+test_data_loader = test_dataset.create(batch_size)
+
+results = conformer.predict(test_data_loader)
+
+with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath:
+    print(f"Saving result to {args.output} ...")
+    with open(filepath, "w") as openfile:
+        openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n")
+        for i, entry in test_dataset.entries:
+            groundtruth, greedy, beamsearch = results[i]
+            path, duration, _ = entry
+            openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n")
diff --git a/tensorflow_asr/losses/ctc_loss.py b/tensorflow_asr/losses/ctc_loss.py
index 6808c57b15..89519a4e60 100644
--- a/tensorflow_asr/losses/ctc_loss.py
+++ b/tensorflow_asr/losses/ctc_loss.py
@@ -21,13 +21,11 @@ def __init__(self, blank=0, global_batch_size=None, name=None):
         self.global_batch_size = global_batch_size
 
     def call(self, y_true, y_pred):
-        logits, logits_length = y_pred.values()
-        labels, labels_length = y_true.values()
         loss = ctc_loss(
-            y_pred=logits,
-            input_length=logits_length,
-            y_true=labels,
-            label_length=labels_length,
+            y_pred=y_pred["logits"],
+            input_length=y_pred["logits_length"],
+            y_true=y_true["labels"],
+            label_length=y_true["labels_length"],
             blank=self.blank,
             name=self.name
         )
diff --git a/tensorflow_asr/losses/rnnt_loss.py b/tensorflow_asr/losses/rnnt_loss.py
index 646ec4586f..85da24dd5f 100644
--- a/tensorflow_asr/losses/rnnt_loss.py
+++ b/tensorflow_asr/losses/rnnt_loss.py
@@ -37,13 +37,11 @@ def __init__(self, blank=0, global_batch_size=None, name=None):
         self.global_batch_size = global_batch_size
 
     def call(self, y_true, y_pred):
-        logits, logits_length = y_pred.values()
-        labels, labels_length = y_true.values()
         loss = rnnt_loss(
-            logits=logits,
-            logit_length=logits_length,
-            labels=labels,
-            label_length=labels_length,
+            logits=y_pred["logits"],
+            logit_length=y_pred["logits_length"],
+            labels=y_true["labels"],
+            label_length=y_true["labels_length"],
             blank=self.blank,
             name=self.name
         )
diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py
index b8378410e2..f2eab7e0ec 100644
--- a/tensorflow_asr/models/base_model.py
+++ b/tensorflow_asr/models/base_model.py
@@ -111,9 +111,12 @@ def predict_step(self, batch):
             [tf.Tensor]: stacked tensor of shape [B, 3] with each row is the text [truth, greedy, beam_search]
         """
         inputs, y_true = batch
-        labels = self.text_featurizer.iextract(y_true)
+        labels = self.text_featurizer.iextract(y_true["labels"])
         greedy_decoding = self.recognize(inputs)
-        beam_search_decoding = self.recognize_beam(inputs)
+        if self.text_featurizer.decoder_config.beam_width == 0:
+            beam_search_decoding = tf.map_fn(lambda _: tf.convert_to_tensor("", dtype=tf.string), labels)
+        else:
+            beam_search_decoding = self.recognize_beam(inputs)
         return tf.stack([labels, greedy_decoding, beam_search_decoding], axis=-1)
 
     def recognize(self, features, input_lengths, **kwargs):
diff --git a/tensorflow_asr/models/ctc/ctc.py b/tensorflow_asr/models/ctc/ctc.py
index ab0b60da16..a30c0e166e 100644
--- a/tensorflow_asr/models/ctc/ctc.py
+++ b/tensorflow_asr/models/ctc/ctc.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Union
+from typing import Dict, Union
 import numpy as np
 import tensorflow as tf
 
@@ -69,19 +69,18 @@ def add_featurizers(self,
         self.text_featurizer = text_featurizer
 
     def call(self, inputs, training=False, **kwargs):
-        inputs, inputs_length, _, _ = inputs.values()
-        logits = self.encoder(inputs, training=training, **kwargs)
+        logits = self.encoder(inputs["inputs"], training=training, **kwargs)
         logits = self.decoder(logits, training=training, **kwargs)
         return data_util.create_logits(
             logits=logits,
-            logits_length=math_util.get_reduced_length(inputs_length, self.time_reduction_factor)
+            logits_length=math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
         )
 
     # -------------------------------- GREEDY -------------------------------------
 
     @tf.function
-    def recognize(self, features: tf.Tensor, input_length: Optional[tf.Tensor]):
-        logits = self(features, training=False)
+    def recognize(self, inputs: Dict[str, tf.Tensor]):
+        logits = self(inputs["inputs"], training=False)
         probs = tf.nn.softmax(logits)
 
         def map_fn(prob): return tf.numpy_function(self._perform_greedy, inp=[prob], Tout=tf.string)
@@ -119,8 +118,8 @@ def recognize_tflite(self, signal):
     # -------------------------------- BEAM SEARCH -------------------------------------
 
     @tf.function
-    def recognize_beam(self, features: tf.Tensor, input_length: Optional[tf.Tensor], lm: bool = False):
-        logits = self(features, training=False)
+    def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False):
+        logits = self(inputs["inputs"], training=False)
         probs = tf.nn.softmax(logits)
 
         def map_fn(prob): return tf.numpy_function(self._perform_beam_search, inp=[prob, lm], Tout=tf.string)
diff --git a/tensorflow_asr/models/transducer/contextnet.py b/tensorflow_asr/models/transducer/contextnet.py
index 2f47f100ee..bc81f17fe7 100644
--- a/tensorflow_asr/models/transducer/contextnet.py
+++ b/tensorflow_asr/models/transducer/contextnet.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List
+from typing import Dict, List
 import tensorflow as tf
 
 from ..encoders.contextnet import ContextNetEncoder, L2
 from .transducer import Transducer
+from ...utils import math_util
 
 
 class ContextNet(Transducer):
@@ -95,11 +96,7 @@ def encoder_inference(self, features: tf.Tensor, input_length: tf.Tensor):
     # -------------------------------- GREEDY -------------------------------------
 
     @tf.function
-    def recognize(self,
-                  features: tf.Tensor,
-                  input_length: tf.Tensor,
-                  parallel_iterations: int = 10,
-                  swap_memory: bool = True):
+    def recognize(self, inputs: Dict[str, tf.Tensor]):
         """
         RNN Transducer Greedy decoding
         Args:
@@ -108,12 +105,9 @@ def recognize(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        encoded = self.encoder([features, input_length], training=False)
-        return self._perform_greedy_batch(
-            encoded, input_length,
-            parallel_iterations=parallel_iterations,
-            swap_memory=swap_memory
-        )
+        encoded = self.encoder([inputs["inputs"], inputs["inputs_length"]], training=False)
+        encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        return self._perform_greedy_batch(encoded=encoded, encoded_length=encoded_length)
 
     def recognize_tflite(self, signal, predicted, prediction_states):
         """
@@ -161,12 +155,7 @@ def recognize_tflite_with_timestamp(self, signal, predicted, states):
     # -------------------------------- BEAM SEARCH -------------------------------------
 
     @tf.function
-    def recognize_beam(self,
-                       features: tf.Tensor,
-                       input_length: tf.Tensor,
-                       lm: bool = False,
-                       parallel_iterations: int = 10,
-                       swap_memory: bool = True):
+    def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False):
         """
         RNN Transducer Beam Search
         Args:
@@ -176,9 +165,6 @@ def recognize_beam(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        encoded = self.encoder([features, input_length], training=False)
-        return self._perform_beam_search_batch(
-            encoded, input_length, lm,
-            parallel_iterations=parallel_iterations,
-            swap_memory=swap_memory
-        )
+        encoded = self.encoder([inputs["inputs"], inputs["inputs_length"]], training=False)
+        encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        return self._perform_beam_search_batch(encoded=encoded, encoded_length=encoded_length, lm=lm)
diff --git a/tensorflow_asr/models/transducer/rnn_transducer.py b/tensorflow_asr/models/transducer/rnn_transducer.py
index 88ef18d80c..b02b9d7113 100644
--- a/tensorflow_asr/models/transducer/rnn_transducer.py
+++ b/tensorflow_asr/models/transducer/rnn_transducer.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """ http://arxiv.org/abs/1811.06621 """
 
+from typing import Dict
 import tensorflow as tf
 
 from ..layers.subsampling import TimeReduction
@@ -256,11 +257,7 @@ def encoder_inference(self, features: tf.Tensor, states: tf.Tensor):
     # -------------------------------- GREEDY -------------------------------------
 
     @tf.function
-    def recognize(self,
-                  features: tf.Tensor,
-                  input_length: tf.Tensor,
-                  parallel_iterations: int = 10,
-                  swap_memory: bool = True):
+    def recognize(self, inputs: Dict[str, tf.Tensor]):
         """
         RNN Transducer Greedy decoding
         Args:
@@ -269,10 +266,10 @@ def recognize(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        batch_size, _, _, _ = shape_util.shape_list(features)
-        encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size))
-        return self._perform_greedy_batch(encoded, input_length,
-                                          parallel_iterations=parallel_iterations, swap_memory=swap_memory)
+        batch_size, _, _, _ = shape_util.shape_list(inputs["inputs"])
+        encoded, _ = self.encoder.recognize(inputs["inputs"], self.encoder.get_initial_state(batch_size))
+        encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        return self._perform_greedy_batch(encoded=encoded, encoded_length=encoded_length)
 
     def recognize_tflite(self, signal, predicted, encoder_states, prediction_states):
         """
@@ -321,12 +318,7 @@ def recognize_tflite_with_timestamp(self, signal, predicted, encoder_states, pre
     # -------------------------------- BEAM SEARCH -------------------------------------
 
     @tf.function
-    def recognize_beam(self,
-                       features: tf.Tensor,
-                       input_length: tf.Tensor,
-                       lm: bool = False,
-                       parallel_iterations: int = 10,
-                       swap_memory: bool = True):
+    def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False):
         """
         RNN Transducer Beam Search
         Args:
@@ -336,10 +328,10 @@ def recognize_beam(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        batch_size, _, _, _ = shape_util.shape_list(features)
-        encoded, _ = self.encoder.recognize(features, self.encoder.get_initial_state(batch_size))
-        return self._perform_beam_search_batch(encoded, input_length, lm,
-                                               parallel_iterations=parallel_iterations, swap_memory=swap_memory)
+        batch_size, _, _, _ = shape_util.shape_list(inputs["inputs"])
+        encoded, _ = self.encoder.recognize(inputs["inputs"], self.encoder.get_initial_state(batch_size))
+        encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        return self._perform_beam_search_batch(encoded=encoded, encoded_length=encoded_length, lm=lm)
 
     # -------------------------------- TFLITE -------------------------------------
 
diff --git a/tensorflow_asr/models/transducer/transducer.py b/tensorflow_asr/models/transducer/transducer.py
index 8917bf5a3b..a68ce66b84 100644
--- a/tensorflow_asr/models/transducer/transducer.py
+++ b/tensorflow_asr/models/transducer/transducer.py
@@ -14,6 +14,7 @@
 """ https://arxiv.org/pdf/1811.06621.pdf """
 
 import collections
+from typing import Dict
 import tensorflow as tf
 
 from ..base_model import BaseModel
@@ -347,13 +348,12 @@ def compile(self,
         super().compile(loss=loss, optimizer=optimizer, run_eagerly=run_eagerly, **kwargs)
 
     def call(self, inputs, training=False, **kwargs):
-        inputs, inputs_length, predictions, predictions_length = inputs.values()
-        enc = self.encoder(inputs, training=training, **kwargs)
-        pred = self.predict_net([predictions, predictions_length], training=training, **kwargs)
+        enc = self.encoder(inputs["inputs"], training=training, **kwargs)
+        pred = self.predict_net([inputs["predictions"], inputs["predictions_length"]], training=training, **kwargs)
         logits = self.joint_net([enc, pred], training=training, **kwargs)
         return data_util.create_logits(
             logits=logits,
-            logits_length=math_util.get_reduced_length(inputs_length, self.time_reduction_factor)
+            logits_length=math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
         )
 
     # -------------------------------- INFERENCES -------------------------------------
@@ -400,11 +400,7 @@ def get_config(self):
     # -------------------------------- GREEDY -------------------------------------
 
     @tf.function
-    def recognize(self,
-                  features: tf.Tensor,
-                  input_length: tf.Tensor,
-                  parallel_iterations: int = 10,
-                  swap_memory: bool = True):
+    def recognize(self, inputs: Dict[str, tf.Tensor]):
         """
         RNN Transducer Greedy decoding
         Args:
@@ -414,9 +410,9 @@ def recognize(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        encoded = self.encoder(features, training=False)
-        return self._perform_greedy_batch(encoded, input_length,
-                                          parallel_iterations=parallel_iterations, swap_memory=swap_memory)
+        encoded = self.encoder(inputs["inputs"], training=False)
+        encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        return self._perform_greedy_batch(encoded=encoded, encoded_length=encoded_length)
 
     def recognize_tflite(self, signal, predicted, states):
         """
@@ -600,12 +596,7 @@ def body(_time, _hypothesis):
     # -------------------------------- BEAM SEARCH -------------------------------------
 
     @tf.function
-    def recognize_beam(self,
-                       features: tf.Tensor,
-                       input_length: tf.Tensor,
-                       lm: bool = False,
-                       parallel_iterations: int = 10,
-                       swap_memory: bool = True):
+    def recognize_beam(self, inputs: Dict[str, tf.Tensor], lm: bool = False):
         """
         RNN Transducer Beam Search
         Args:
@@ -615,9 +606,9 @@ def recognize_beam(self,
         Returns:
             tf.Tensor: a batch of decoded transcripts
         """
-        encoded = self.encoder(features, training=False)
-        return self._perform_beam_search_batch(encoded, input_length, lm,
-                                               parallel_iterations=parallel_iterations, swap_memory=swap_memory)
+        encoded = self.encoder(inputs["inputs"], training=False)
+        encoded_length = math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        return self._perform_beam_search_batch(encoded=encoded, encoded_length=encoded_length, lm=lm)
 
     def _perform_beam_search_batch(self,
                                    encoded: tf.Tensor,
diff --git a/tensorflow_asr/utils/env_util.py b/tensorflow_asr/utils/env_util.py
index 2bf4970415..c5564b543e 100644
--- a/tensorflow_asr/utils/env_util.py
+++ b/tensorflow_asr/utils/env_util.py
@@ -12,15 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 import tensorflow as tf
 
 
 def setup_environment():  # Set memory growth and only log ERRORs
     """ Setting tensorflow running environment """
-    import warnings
     warnings.simplefilter("ignore")
     tf.get_logger().setLevel("ERROR")
-    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
 
 
 def setup_devices(devices, cpu=False):

From 4dbbb175fd723e778a5008ca7d11d76426679e0b Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Sat, 17 Apr 2021 13:25:43 +0700
Subject: [PATCH 09/13] :writing_hand: update testing functions and scripts

---
 examples/conformer/test.py            | 27 ++++++++++------
 tensorflow_asr/metrics/error_rates.py |  2 +-
 tensorflow_asr/models/base_model.py   | 16 +++++++---
 tensorflow_asr/utils/app_util.py      | 45 +++++++++++++++++++++++++++
 4 files changed, 76 insertions(+), 14 deletions(-)
 create mode 100644 tensorflow_asr/utils/app_util.py

diff --git a/examples/conformer/test.py b/examples/conformer/test.py
index 4c1f8e13d1..12874187ba 100644
--- a/examples/conformer/test.py
+++ b/examples/conformer/test.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+from tqdm import tqdm
 import argparse
 from tensorflow_asr.utils import env_util, file_util
 
@@ -58,6 +59,7 @@
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
 from tensorflow_asr.models.transducer.conformer import Conformer
+from tensorflow_asr.utils import app_util
 
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
@@ -97,13 +99,20 @@
 batch_size = args.bs or config.learning_config.running_config.batch_size
 test_data_loader = test_dataset.create(batch_size)
 
-results = conformer.predict(test_data_loader)
-
 with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath:
-    print(f"Saving result to {args.output} ...")
-    with open(filepath, "w") as openfile:
-        openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n")
-        for i, entry in test_dataset.entries:
-            groundtruth, greedy, beamsearch = results[i]
-            path, duration, _ = entry
-            openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n")
+    overwrite = True  # default to running prediction; only ask when a result file already exists
+    if tf.io.gfile.exists(filepath):
+        overwrite = input("Overwrite existing result file? (y/n): ").lower() == "y"
+    if overwrite:
+        results = conformer.predict(test_data_loader, verbose=1)
+        print(f"Saving result to {args.output} ...")
+        with open(filepath, "w", encoding="utf-8") as openfile:  # evaluate_results reads this file back as utf-8
+            openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n")
+            progbar = tqdm(total=test_dataset.total_steps, unit="batch")
+            for i, pred in enumerate(results):
+                groundtruth, greedy, beamsearch = [x.decode('utf-8') for x in pred]
+                path, duration, _ = test_dataset.entries[i]
+                openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n")
+                progbar.update(1)
+            progbar.close()
+    app_util.evaluate_results(filepath)  # runs in both cases: fresh predictions or the kept existing file
diff --git a/tensorflow_asr/metrics/error_rates.py b/tensorflow_asr/metrics/error_rates.py
index 143e199109..2d6880e35e 100644
--- a/tensorflow_asr/metrics/error_rates.py
+++ b/tensorflow_asr/metrics/error_rates.py
@@ -30,4 +30,4 @@ def update_state(self, decode: tf.Tensor, target: tf.Tensor):
         self.denominator.assign_add(d)
 
     def result(self):
-        return tf.math.divide_no_nan(self.numerator, self.denominator) * 100
+        return tf.math.divide_no_nan(self.numerator, self.denominator)
diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py
index f2eab7e0ec..1ebf8787e5 100644
--- a/tensorflow_asr/models/base_model.py
+++ b/tensorflow_asr/models/base_model.py
@@ -19,6 +19,10 @@
 
 
 class BaseModel(tf.keras.Model):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._metrics = {}  # metric name -> metric object; compile() replaces it with a dict holding the "loss" metric
+
     def save(self,
              filepath,
              overwrite=True,
@@ -66,7 +70,10 @@ def load_weights(self,
 
     @property
     def metrics(self):
-        return [self.loss_metric]
+        return self._metrics.values()
+
+    def add_metric(self, metric: tf.keras.metrics.Metric):
+        self._metrics[metric.name] = metric  # _metrics is a dict keyed by metric name; dicts have no append()
 
     def _build(self, *args, **kwargs):
         raise NotImplementedError()
@@ -76,7 +83,8 @@ def compile(self, loss, optimizer, run_eagerly=None, **kwargs):
         if not env_util.has_tpu():
             optimizer = mxp.experimental.LossScaleOptimizer(tf.keras.optimizers.get(optimizer), "dynamic")
             self.use_loss_scale = True
-        self.loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
+        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
+        self._metrics = {loss_metric.name: loss_metric}
         super().compile(optimizer=optimizer, loss=loss, run_eagerly=run_eagerly, **kwargs)
 
     # -------------------------------- STEP FUNCTIONS -------------------------------------
@@ -92,14 +100,14 @@ def train_step(self, batch):
         if self.use_loss_scale:
             gradients = self.optimizer.get_unscaled_gradients(gradients)
         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
-        self.loss_metric.update_state(loss)
+        self._metrics["loss"].update_state(loss)
         return {m.name: m.result() for m in self.metrics}
 
     def test_step(self, batch):
         inputs, y_true = batch
         y_pred = self(inputs, training=False)
         loss = self.loss(y_true, y_pred)
-        self.loss_metric.update_state(loss)
+        self._metrics["loss"].update_state(loss)
         return {m.name: m.result() for m in self.metrics}
 
     def predict_step(self, batch):
diff --git a/tensorflow_asr/utils/app_util.py b/tensorflow_asr/utils/app_util.py
new file mode 100644
index 0000000000..b996a8030f
--- /dev/null
+++ b/tensorflow_asr/utils/app_util.py
@@ -0,0 +1,45 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tqdm import tqdm
+import tensorflow as tf
+
+from .metric_util import wer, cer
+from ..metrics.error_rates import ErrorRate
+from .file_util import read_file
+
+
+def evaluate_results(filepath: str):  # print greedy/beam-search WER and CER computed from a TSV result file
+    print(f"Evaluating result from {filepath} ...")
+    metrics = {  # one ErrorRate accumulator per (decoding method, error function) pair
+        "greedy_wer": ErrorRate(wer, name="greedy_wer", dtype=tf.float32),
+        "greedy_cer": ErrorRate(cer, name="greedy_cer", dtype=tf.float32),
+        "beamsearch_wer": ErrorRate(wer, name="beamsearch_wer", dtype=tf.float32),
+        "beamsearch_cer": ErrorRate(cer, name="beamsearch_cer", dtype=tf.float32)
+    }
+    with read_file(filepath) as path:
+        with open(path, "r", encoding="utf-8") as openfile:
+            lines = openfile.read().splitlines()  # whole file is loaded into memory at once
+            lines = lines[1:]  # skip header
+    for eachline in tqdm(lines):
+        _, _, groundtruth, greedy, beamsearch = eachline.split("\t")  # row must have exactly 5 tab-separated fields
+        groundtruth = tf.convert_to_tensor([groundtruth], dtype=tf.string)  # wrap each string as a 1-element tensor
+        greedy = tf.convert_to_tensor([greedy], dtype=tf.string)
+        beamsearch = tf.convert_to_tensor([beamsearch], dtype=tf.string)
+        metrics["greedy_wer"].update_state(decode=greedy, target=groundtruth)
+        metrics["greedy_cer"].update_state(decode=greedy, target=groundtruth)
+        metrics["beamsearch_wer"].update_state(decode=beamsearch, target=groundtruth)
+        metrics["beamsearch_cer"].update_state(decode=beamsearch, target=groundtruth)
+    for key, value in metrics.items():  # report each accumulated metric by name
+        print(f"{key}: {value.result().numpy()}")

From 51d8c5524bf44d9e7841ae48378fd4de801576a9 Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Sat, 17 Apr 2021 13:41:34 +0700
Subject: [PATCH 10/13] :writing_hand: update example scripts

---
 examples/conformer/tflite.py                  |   4 +-
 examples/contextnet/test.py                   |  74 ++++++---
 examples/contextnet/tflite.py                 |  38 ++---
 examples/contextnet/train.py                  |  98 ++++++------
 .../train_tpu_keras_subword_contextnet.py     | 144 ------------------
 examples/deepspeech2/test.py                  |  94 ++++++++----
 examples/deepspeech2/tflite.py                |  69 +++++++++
 examples/deepspeech2/train.py                 | 106 ++++++++-----
 examples/demonstration/conformer.py           |  11 +-
 examples/jasper/test.py                       |  92 +++++++----
 examples/jasper/tflite.py                     |  69 +++++++++
 examples/jasper/train.py                      |  93 ++++++-----
 examples/rnn_transducer/test.py               |  87 +++++++----
 examples/rnn_transducer/tflite.py             |  53 +++----
 examples/rnn_transducer/train.py              | 113 ++++++++------
 15 files changed, 655 insertions(+), 490 deletions(-)
 delete mode 100644 examples/contextnet/train_tpu_keras_subword_contextnet.py
 create mode 100644 examples/deepspeech2/tflite.py
 create mode 100644 examples/jasper/tflite.py

diff --git a/examples/conformer/tflite.py b/examples/conformer/tflite.py
index 3159f656ba..b0d40b0679 100644
--- a/examples/conformer/tflite.py
+++ b/examples/conformer/tflite.py
@@ -22,13 +22,13 @@
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer
-from tensorflow_asr.models.conformer import Conformer
+from tensorflow_asr.models.transducer.conformer import Conformer
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
-parser = argparse.ArgumentParser(prog="Conformer Testing")
+parser = argparse.ArgumentParser(prog="Conformer TFLite")
 
 parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
diff --git a/examples/contextnet/test.py b/examples/contextnet/test.py
index 0aaabce52b..afa6c6211b 100644
--- a/examples/contextnet/test.py
+++ b/examples/contextnet/test.py
@@ -13,17 +13,18 @@
 # limitations under the License.
 
 import os
+from tqdm import tqdm
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_devices
+from tensorflow_asr.utils import env_util, file_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
-parser = argparse.ArgumentParser(prog="ContextNet Testing")
+parser = argparse.ArgumentParser(prog="Contextnet Testing")
 
 parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
@@ -33,47 +34,58 @@
 
 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
 
+parser.add_argument("--bs", type=int, default=None, help="Test batch size")
+
+parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
+
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
+
 parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
 
 parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
 
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")
+parser.add_argument("--output", type=str, default="test.tsv", help="Result filepath")
 
 args = parser.parse_args()
 
+assert args.saved
+
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
 
-setup_devices([args.device], cpu=args.cpu)
+env_util.setup_devices([args.device], cpu=args.cpu)
 
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
-from tensorflow_asr.runners.base_runners import BaseTester
-from tensorflow_asr.models.contextnet import ContextNet
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
+from tensorflow_asr.models.transducer.contextnet import ContextNet
+from tensorflow_asr.utils import app_util
 
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
 
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+if args.sentence_piece:
+    print("Use SentencePiece ...")
+    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
+    print("Use subwords ...")
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
-    raise ValueError("subwords must be set")
+    print("Use characters ...")
+    text_featurizer = CharFeaturizer(config.decoder_config)
 
 tf.random.set_seed(0)
-assert args.saved
 
 if args.tfrecords:
     test_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )
 else:
     test_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )
 
@@ -81,12 +93,26 @@
 contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
 contextnet._build(speech_featurizer.shape)
 contextnet.load_weights(args.saved)
-contextnet.summary(line_length=120)
+contextnet.summary(line_length=100)
 contextnet.add_featurizers(speech_featurizer, text_featurizer)
 
-contextnet_tester = BaseTester(
-    config=config.learning_config.running_config,
-    output_name=args.output_name
-)
-contextnet_tester.compile(contextnet)
-contextnet_tester.run(test_dataset)
+batch_size = args.bs or config.learning_config.running_config.batch_size
+test_data_loader = test_dataset.create(batch_size)
+
+with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath:
+    overwrite = True  # default to running prediction; only ask when a result file already exists
+    if tf.io.gfile.exists(filepath):
+        overwrite = input("Overwrite existing result file? (y/n): ").lower() == "y"
+    if overwrite:
+        results = contextnet.predict(test_data_loader, verbose=1)
+        print(f"Saving result to {args.output} ...")
+        with open(filepath, "w", encoding="utf-8") as openfile:  # evaluate_results reads this file back as utf-8
+            openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n")
+            progbar = tqdm(total=test_dataset.total_steps, unit="batch")
+            for i, pred in enumerate(results):
+                groundtruth, greedy, beamsearch = [x.decode('utf-8') for x in pred]
+                path, duration, _ = test_dataset.entries[i]
+                openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n")
+                progbar.update(1)
+            progbar.close()
+    app_util.evaluate_results(filepath)  # runs in both cases: fresh predictions or the kept existing file
diff --git a/examples/contextnet/tflite.py b/examples/contextnet/tflite.py
index a76e4a6b78..0e8852cb19 100644
--- a/examples/contextnet/tflite.py
+++ b/examples/contextnet/tflite.py
@@ -14,33 +14,29 @@
 
 import os
 import argparse
-from tensorflow_asr.utils import setup_environment
+from tensorflow_asr.utils import env_util, file_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
-from tensorflow_asr.models.contextnet import ContextNet
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer
+from tensorflow_asr.models.transducer.contextnet import ContextNet
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
-parser = argparse.ArgumentParser(prog="ContextNet Testing")
+parser = argparse.ArgumentParser(prog="ContextNet TFLite")
 
-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
+parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--saved", type=str, default=None,
-                    help="Path to saved model")
+parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
 
-parser.add_argument("--subwords", type=str, default=None,
-                    help="Path to file that stores generated subwords")
+parser.add_argument("--subwords", type=str, default=None, help="Use subwords")
 
-parser.add_argument("output", type=str, default=None,
-                    help="TFLite file path to be exported")
+parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported")
 
 args = parser.parse_args()
 
@@ -49,27 +45,25 @@
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
 
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+if args.subwords:
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
-    raise ValueError("subwords must be set")
+    text_featurizer = CharFeaturizer(config.decoder_config)
 
 # build model
 contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
 contextnet._build(speech_featurizer.shape)
 contextnet.load_weights(args.saved)
-contextnet.summary(line_length=150)
+contextnet.summary(line_length=100)
 contextnet.add_featurizers(speech_featurizer, text_featurizer)
 
 concrete_func = contextnet.make_tflite_function().get_concrete_function()
 converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
+converter.experimental_new_converter = True
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
-converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
-                                       tf.lite.OpsSet.SELECT_TF_OPS]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
 tflite_model = converter.convert()
 
-if not os.path.exists(os.path.dirname(args.output)):
-    os.makedirs(os.path.dirname(args.output))
+args.output = file_util.preprocess_paths(args.output)
 with open(args.output, "wb") as tflite_out:
     tflite_out.write(tflite_model)
diff --git a/examples/contextnet/train.py b/examples/contextnet/train.py
index 4046cfb858..7644fdeabe 100644
--- a/examples/contextnet/train.py
+++ b/examples/contextnet/train.py
@@ -15,96 +15,99 @@
 import os
 import math
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
+from tensorflow_asr.utils import env_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
-parser = argparse.ArgumentParser(prog="ContextNet Training")
+parser = argparse.ArgumentParser(prog="Contextnet Training")
 
 parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
 parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
 
+parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
+
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
+
 parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
 
 parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
 
 parser.add_argument("--spx", type=int, default=1, help="Steps per execution for maximizing performance")
 
-parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata")
+parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata")
+
+parser.add_argument("--static_length", default=False, action="store_true", help="Use static lengths")
 
 parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
 
 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
 
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords")
-
 args = parser.parse_args()
 
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
 
-strategy = setup_strategy(args.devices)
+strategy = env_util.setup_strategy(args.devices)
 
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
-from tensorflow_asr.models.keras.contextnet import ContextNet
+from tensorflow_asr.datasets import asr_dataset
+from tensorflow_asr.featurizers import speech_featurizers, text_featurizers
+from tensorflow_asr.models.transducer.contextnet import ContextNet
 from tensorflow_asr.optimizers.schedules import TransformerSchedule
 
 config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
+speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)
 
-if args.subwords and os.path.exists(args.subwords):
+if args.sentence_piece:
+    print("Loading SentencePiece model ...")
+    text_featurizer = text_featurizers.SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
     print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+    text_featurizer = text_featurizers.SubwordFeaturizer(config.decoder_config)
 else:
-    print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
+    print("Use characters ...")
+    text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)
 
 if args.tfrecords:
-    train_dataset = ASRTFRecordDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    train_dataset = asr_dataset.ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.train_dataset_config),
         indefinite=True
     )
-    eval_dataset = ASRTFRecordDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    eval_dataset = asr_dataset.ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.eval_dataset_config),
         indefinite=True
     )
-    # Update metadata calculated from both train and eval datasets
-    train_dataset.load_metadata(args.metadata_prefix)
-    eval_dataset.load_metadata(args.metadata_prefix)
-    # Use dynamic length
-    speech_featurizer.reset_length()
-    text_featurizer.reset_length()
 else:
-    train_dataset = ASRSliceDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    train_dataset = asr_dataset.ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.train_dataset_config),
         indefinite=True
     )
-    eval_dataset = ASRSliceDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    eval_dataset = asr_dataset.ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.eval_dataset_config),
         indefinite=True
     )
 
-global_batch_size = config.learning_config.running_config.batch_size
+train_dataset.load_metadata(args.metadata)
+eval_dataset.load_metadata(args.metadata)
+
+if not args.static_length:
+    speech_featurizer.reset_length()
+    text_featurizer.reset_length()
+
+global_batch_size = args.tbs or config.learning_config.running_config.batch_size
 global_batch_size *= strategy.num_replicas_in_sync
 
 train_data_loader = train_dataset.create(global_batch_size)
@@ -114,17 +117,15 @@
     # build model
     contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
     contextnet._build(speech_featurizer.shape)
-    contextnet.summary(line_length=120)
+    contextnet.summary(line_length=100)
 
     optimizer = tf.keras.optimizers.Adam(
         TransformerSchedule(
             d_model=contextnet.dmodel,
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
+            warmup_steps=config.learning_config.optimizer_config.pop("warmup_steps", 10000),
             max_lr=(0.05 / math.sqrt(contextnet.dmodel))
         ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
+        **config.learning_config.optimizer_config
     )
 
     contextnet.compile(
@@ -141,7 +142,10 @@
 ]
 
 contextnet.fit(
-    train_data_loader, epochs=config.learning_config.running_config.num_epochs,
-    validation_data=eval_data_loader, callbacks=callbacks,
-    steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps
+    train_data_loader,
+    epochs=config.learning_config.running_config.num_epochs,
+    validation_data=eval_data_loader,
+    callbacks=callbacks,
+    steps_per_epoch=train_dataset.total_steps,
+    validation_steps=eval_dataset.total_steps
 )
diff --git a/examples/contextnet/train_tpu_keras_subword_contextnet.py b/examples/contextnet/train_tpu_keras_subword_contextnet.py
deleted file mode 100644
index f0bc5e64a8..0000000000
--- a/examples/contextnet/train_tpu_keras_subword_contextnet.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright 2020 Huy Le Nguyen (@usimarit)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import math
-import argparse
-from tensorflow_asr.utils import setup_environment, setup_tpu
-
-setup_environment()
-import tensorflow as tf
-
-DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
-
-tf.keras.backend.clear_session()
-
-parser = argparse.ArgumentParser(prog="Conformer Training")
-
-parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
-
-parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
-
-parser.add_argument("--bs", type=int, default=None, help="Batch size per replica")
-
-parser.add_argument("--spx", type=int, default=50, help="Steps per execution for maximizing TPU performance")
-
-parser.add_argument("--tpu_address", type=str, default=None, help="TPU address. Leave None on Colab")
-
-parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata")
-
-parser.add_argument("--compute_lengths", default=False, action="store_true", help="Whether to compute lengths")
-
-parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
-
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--subwords_corpus", nargs="*", type=str, default=[], help="Transcript files for generating subwords")
-
-parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
-
-args = parser.parse_args()
-
-tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
-
-strategy = setup_tpu(args.tpu_address)
-
-from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import TFSubwordFeaturizer, SentencePieceFeaturizer
-from tensorflow_asr.models.keras.contextnet import ContextNet
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
-
-config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-
-if args.sentence_piece:
-    print("Loading SentencePiece model ...")
-    text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords)
-elif args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = TFSubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
-else:
-    print("Generating subwords ...")
-    text_featurizer = TFSubwordFeaturizer.build_from_corpus(
-        config.decoder_config,
-        corpus_files=args.subwords_corpus
-    )
-    text_featurizer.save_to_file(args.subwords)
-
-train_dataset = ASRTFRecordDatasetKeras(
-    speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-    **vars(config.learning_config.train_dataset_config),
-    indefinite=True
-)
-eval_dataset = ASRTFRecordDatasetKeras(
-    speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-    **vars(config.learning_config.eval_dataset_config),
-    indefinite=True
-)
-
-if args.compute_lengths:
-    train_dataset.update_lengths(args.metadata_prefix)
-    eval_dataset.update_lengths(args.metadata_prefix)
-
-# Update metadata calculated from both train and eval datasets
-train_dataset.load_metadata(args.metadata_prefix)
-eval_dataset.load_metadata(args.metadata_prefix)
-
-batch_size = args.bs if args.bs is not None else config.learning_config.running_config.batch_size
-global_batch_size = batch_size
-global_batch_size *= strategy.num_replicas_in_sync
-
-train_data_loader = train_dataset.create(global_batch_size)
-eval_data_loader = eval_dataset.create(global_batch_size)
-
-with strategy.scope():
-    # build model
-    contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    contextnet._build(speech_featurizer.shape, prediction_shape=text_featurizer.prepand_shape, batch_size=global_batch_size)
-    contextnet.summary(line_length=120)
-
-    if args.saved:
-        contextnet.load_weights(args.saved, by_name=True, skip_mismatch=True)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=contextnet.dmodel,
-            warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
-            max_lr=(0.05 / math.sqrt(contextnet.dmodel))
-        ),
-        beta_1=config.learning_config.optimizer_config["beta1"],
-        beta_2=config.learning_config.optimizer_config["beta2"],
-        epsilon=config.learning_config.optimizer_config["epsilon"]
-    )
-
-    contextnet.compile(
-        optimizer=optimizer,
-        experimental_steps_per_execution=args.spx,
-        global_batch_size=global_batch_size,
-        blank=text_featurizer.blank
-    )
-
-callbacks = [
-    tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint),
-    tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir),
-    tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard)
-]
-
-contextnet.fit(
-    train_data_loader, epochs=config.learning_config.running_config.num_epochs,
-    validation_data=eval_data_loader, callbacks=callbacks,
-    steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps
-)
diff --git a/examples/deepspeech2/test.py b/examples/deepspeech2/test.py
index 096add656f..d475be31c1 100644
--- a/examples/deepspeech2/test.py
+++ b/examples/deepspeech2/test.py
@@ -13,70 +13,106 @@
 # limitations under the License.
 
 import os
+from tqdm import tqdm
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_devices
+from tensorflow_asr.utils import env_util, file_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
-parser = argparse.ArgumentParser(prog="Deep Speech 2 Tester")
+parser = argparse.ArgumentParser(prog="DeepSpeech2 Testing")
 
-parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
+parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--saved", type=str, default=None, help="Path to the model file to be exported")
+parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
 
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset")
+parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset")
 
 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
 
+parser.add_argument("--bs", type=int, default=None, help="Test batch size")
+
+parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
+
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
+
 parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
 
-parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")
+parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
+
+parser.add_argument("--output", type=str, default="test.tsv", help="Result filepath")
 
 args = parser.parse_args()
 
+assert args.saved
+
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
 
-setup_devices([args.device])
+env_util.setup_devices([args.device], cpu=args.cpu)
 
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.base_runners import BaseTester
-from tensorflow_asr.models.deepspeech2 import DeepSpeech2
-
-tf.random.set_seed(0)
-assert args.export
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
+from tensorflow_asr.models.ctc.deepspeech2 import DeepSpeech2
+from tensorflow_asr.utils import app_util
 
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-# Build DS2 model
-ds2_model = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-ds2_model._build(speech_featurizer.shape)
-ds2_model.load_weights(args.saved)
-ds2_model.summary(line_length=120)
-ds2_model.add_featurizers(speech_featurizer, text_featurizer)
+
+if args.sentence_piece:
+    print("Use SentencePiece ...")
+    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
+    print("Use subwords ...")
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
+else:
+    print("Use characters ...")
+    text_featurizer = CharFeaturizer(config.decoder_config)
+
+tf.random.set_seed(0)
 
 if args.tfrecords:
     test_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )
 else:
     test_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )
 
-ctc_tester = BaseTester(
-    config=config.learning_config.running_config,
-    output_name=args.output_name
-)
-ctc_tester.compile(ds2_model)
-ctc_tester.run(test_dataset)
+# build model
+deepspeech2 = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes)
+deepspeech2._build(speech_featurizer.shape)
+deepspeech2.load_weights(args.saved)
+deepspeech2.summary(line_length=100)
+deepspeech2.add_featurizers(speech_featurizer, text_featurizer)
+
+batch_size = args.bs or config.learning_config.running_config.batch_size
+test_data_loader = test_dataset.create(batch_size)
+
+with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath:
+    overwrite = True  # run inference by default; only a pre-existing result file needs confirmation
+    if tf.io.gfile.exists(filepath):
+        overwrite = input("Overwrite existing result file? (y/n): ").lower() == "y"
+    if overwrite:
+        results = deepspeech2.predict(test_data_loader, verbose=1)
+        print(f"Saving result to {args.output} ...")
+        with open(filepath, "w") as openfile:
+            openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n")
+            progbar = tqdm(total=test_dataset.total_steps, unit="batch")
+            for i, pred in enumerate(results):
+                groundtruth, greedy, beamsearch = [x.decode('utf-8') for x in pred]
+                path, duration, _ = test_dataset.entries[i]
+                openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n")
+                progbar.update(1)
+            progbar.close()
+    app_util.evaluate_results(filepath)  # evaluated whether the file was just written or kept from an earlier run
diff --git a/examples/deepspeech2/tflite.py b/examples/deepspeech2/tflite.py
new file mode 100644
index 0000000000..81980e1fb2
--- /dev/null
+++ b/examples/deepspeech2/tflite.py
@@ -0,0 +1,69 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+from tensorflow_asr.utils import env_util, file_util
+
+env_util.setup_environment()
+import tensorflow as tf
+
+from tensorflow_asr.configs.config import Config
+from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer
+from tensorflow_asr.models.ctc.deepspeech2 import DeepSpeech2
+
+DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
+
+tf.keras.backend.clear_session()
+
+parser = argparse.ArgumentParser(prog="DeepSpeech2 TFLite")
+
+parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
+
+parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
+
+parser.add_argument("--subwords", type=str, default=None, help="Use subwords")
+
+parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported")
+
+args = parser.parse_args()
+
+assert args.saved and args.output
+
+config = Config(args.config)
+speech_featurizer = TFSpeechFeaturizer(config.speech_config)
+
+if args.subwords:
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
+else:
+    text_featurizer = CharFeaturizer(config.decoder_config)
+
+# build model
+deepspeech2 = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes)
+deepspeech2._build(speech_featurizer.shape)
+deepspeech2.load_weights(args.saved)
+deepspeech2.summary(line_length=100)
+deepspeech2.add_featurizers(speech_featurizer, text_featurizer)
+
+concrete_func = deepspeech2.make_tflite_function().get_concrete_function()
+converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
+converter.experimental_new_converter = True
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
+tflite_model = converter.convert()
+
+args.output = file_util.preprocess_paths(args.output)
+with open(args.output, "wb") as tflite_out:
+    tflite_out.write(tflite_model)
diff --git a/examples/deepspeech2/train.py b/examples/deepspeech2/train.py
index 49e0b83d95..3f3e5972c5 100644
--- a/examples/deepspeech2/train.py
+++ b/examples/deepspeech2/train.py
@@ -14,28 +14,34 @@
 
 import os
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
+from tensorflow_asr.utils import env_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
-parser = argparse.ArgumentParser(prog="Deep Speech 2 Training")
+parser = argparse.ArgumentParser(prog="DeepSpeech2 Training")
 
-parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
+parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
+parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
 
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas")
+parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
 
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas")
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
 
-parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata")
+parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
 
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset")
+parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
+
+parser.add_argument("--spx", type=int, default=1, help="Steps per execution for maximizing performance")
+
+parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata")
+
+parser.add_argument("--static_length", default=False, action="store_true", help="Use static lengths")
 
 parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
 
@@ -45,59 +51,72 @@
 
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
 
-strategy = setup_strategy(args.devices)
+strategy = env_util.setup_strategy(args.devices)
 
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.models.keras.deepspeech2 import DeepSpeech2
+from tensorflow_asr.datasets import asr_dataset
+from tensorflow_asr.featurizers import speech_featurizers, text_featurizers
+from tensorflow_asr.models.ctc.deepspeech2 import DeepSpeech2
 
 config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
+speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)
+
+if args.sentence_piece:
+    print("Loading SentencePiece model ...")
+    text_featurizer = text_featurizers.SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
+    print("Loading subwords ...")
+    text_featurizer = text_featurizers.SubwordFeaturizer(config.decoder_config)
+else:
+    print("Use characters ...")
+    text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)
 
 if args.tfrecords:
-    train_dataset = ASRTFRecordDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    train_dataset = asr_dataset.ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.train_dataset_config),
         indefinite=True
     )
-    eval_dataset = ASRTFRecordDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
+    eval_dataset = asr_dataset.ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        **vars(config.learning_config.eval_dataset_config),
+        indefinite=True
     )
-    # Update metadata calculated from both train and eval datasets
-    train_dataset.load_metadata(args.metadata_prefix)
-    eval_dataset.load_metadata(args.metadata_prefix)
-    # Use dynamic length
-    speech_featurizer.reset_length()
-    text_featurizer.reset_length()
 else:
-    train_dataset = ASRSliceDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    train_dataset = asr_dataset.ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.train_dataset_config),
         indefinite=True
     )
-    eval_dataset = ASRSliceDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    eval_dataset = asr_dataset.ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.eval_dataset_config),
         indefinite=True
     )
 
-global_batch_size = config.learning_config.running_config.batch_size
+train_dataset.load_metadata(args.metadata)
+eval_dataset.load_metadata(args.metadata)
+
+if not args.static_length:
+    speech_featurizer.reset_length()
+    text_featurizer.reset_length()
+
+global_batch_size = args.tbs or config.learning_config.running_config.batch_size
 global_batch_size *= strategy.num_replicas_in_sync
 
 train_data_loader = train_dataset.create(global_batch_size)
 eval_data_loader = eval_dataset.create(global_batch_size)
 
-# Build DS2 model
 with strategy.scope():
-    ds2_model = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-    ds2_model._build(speech_featurizer.shape)
-    ds2_model.summary(line_length=120)
-
-    ds2_model.compile(
+    # build model
+    deepspeech2 = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes)
+    deepspeech2._build(speech_featurizer.shape)
+    deepspeech2.summary(line_length=100)
+    deepspeech2.compile(
         optimizer=config.learning_config.optimizer_config,
         experimental_steps_per_execution=args.spx,
         global_batch_size=global_batch_size,
@@ -110,8 +129,11 @@
     tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard)
 ]
 
-ds2_model.fit(
-    train_data_loader, epochs=config.learning_config.running_config.num_epochs,
-    validation_data=eval_data_loader, callbacks=callbacks,
-    steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps
+deepspeech2.fit(
+    train_data_loader,
+    epochs=config.learning_config.running_config.num_epochs,
+    validation_data=eval_data_loader,
+    callbacks=callbacks,
+    steps_per_epoch=train_dataset.total_steps,
+    validation_steps=eval_dataset.total_steps
 )
diff --git a/examples/demonstration/conformer.py b/examples/demonstration/conformer.py
index 7e0a280f53..1870a9777c 100644
--- a/examples/demonstration/conformer.py
+++ b/examples/demonstration/conformer.py
@@ -14,10 +14,9 @@
 
 import os
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_devices
-from tensorflow_asr.utils.utils import get_reduced_length
+from tensorflow_asr.utils import env_util, math_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 parser = argparse.ArgumentParser(prog="Conformer non streaming")
@@ -42,13 +41,13 @@
 
 args = parser.parse_args()
 
-setup_devices([args.device], cpu=args.cpu)
+env_util.setup_devices([args.device], cpu=args.cpu)
 
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer, SubwordFeaturizer, SentencePieceFeaturizer
-from tensorflow_asr.models.conformer import Conformer
+from tensorflow_asr.models.transducer.conformer import Conformer
 
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
@@ -71,7 +70,7 @@
 
 signal = read_raw_audio(args.filename)
 features = speech_featurizer.tf_extract(signal)
-input_length = get_reduced_length(tf.shape(features)[0], conformer.time_reduction_factor)
+input_length = math_util.get_reduced_length(tf.shape(features)[0], conformer.time_reduction_factor)
 
 if args.beam_width:
     transcript = conformer.recognize_beam(features[None, ...], input_length[None, ...])
diff --git a/examples/jasper/test.py b/examples/jasper/test.py
index 48cabaf808..06e7c98ede 100644
--- a/examples/jasper/test.py
+++ b/examples/jasper/test.py
@@ -13,10 +13,11 @@
 # limitations under the License.
 
 import os
+from tqdm import tqdm
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_devices
+from tensorflow_asr.utils import env_util, file_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
@@ -25,58 +26,93 @@
 
 parser = argparse.ArgumentParser(prog="Jasper Testing")
 
-parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
+parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--saved", type=str, default=None, help="Path to the model file to be exported")
+parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
 
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset")
+parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords as dataset")
 
 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
 
+parser.add_argument("--bs", type=int, default=None, help="Test batch size")
+
+parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
+
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
+
 parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
 
-parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")
+parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
+
+parser.add_argument("--output", type=str, default="test.tsv", help="Result filepath")
 
 args = parser.parse_args()
 
+assert args.saved
+
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
 
-setup_devices([args.device])
+env_util.setup_devices([args.device], cpu=args.cpu)
 
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.runners.base_runners import BaseTester
-from tensorflow_asr.models.jasper import Jasper
-
-tf.random.set_seed(0)
-assert args.export
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
+from tensorflow_asr.models.ctc.jasper import Jasper
+from tensorflow_asr.utils import app_util
 
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
-# Build DS2 model
-jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes)
-jasper._build(speech_featurizer.shape)
-jasper.load_weights(args.saved)
-jasper.summary(line_length=120)
-jasper.add_featurizers(speech_featurizer, text_featurizer)
+
+if args.sentence_piece:
+    print("Use SentencePiece ...")
+    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
+    print("Use subwords ...")
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
+else:
+    print("Use characters ...")
+    text_featurizer = CharFeaturizer(config.decoder_config)
+
+tf.random.set_seed(0)
 
 if args.tfrecords:
     test_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )
 else:
     test_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )
 
-ctc_tester = BaseTester(
-    config=config.learning_config.running_config,
-    output_name=args.output_name
-)
-ctc_tester.compile(jasper)
-ctc_tester.run(test_dataset)
+# build model
+jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes)
+jasper._build(speech_featurizer.shape)
+jasper.load_weights(args.saved)
+jasper.summary(line_length=100)
+jasper.add_featurizers(speech_featurizer, text_featurizer)
+
+batch_size = args.bs or config.learning_config.running_config.batch_size
+test_data_loader = test_dataset.create(batch_size)
+
+with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath:
+    overwrite = True  # run inference by default; only a pre-existing result file needs confirmation
+    if tf.io.gfile.exists(filepath):
+        overwrite = input("Overwrite existing result file? (y/n): ").lower() == "y"
+    if overwrite:
+        results = jasper.predict(test_data_loader, verbose=1)
+        print(f"Saving result to {args.output} ...")
+        with open(filepath, "w") as openfile:
+            openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n")
+            progbar = tqdm(total=test_dataset.total_steps, unit="batch")
+            for i, pred in enumerate(results):
+                groundtruth, greedy, beamsearch = [x.decode('utf-8') for x in pred]
+                path, duration, _ = test_dataset.entries[i]
+                openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n")
+                progbar.update(1)
+            progbar.close()
+    app_util.evaluate_results(filepath)  # evaluated whether the file was just written or kept from an earlier run
diff --git a/examples/jasper/tflite.py b/examples/jasper/tflite.py
new file mode 100644
index 0000000000..962118e165
--- /dev/null
+++ b/examples/jasper/tflite.py
@@ -0,0 +1,69 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+from tensorflow_asr.utils import env_util, file_util
+
+env_util.setup_environment()
+import tensorflow as tf
+
+from tensorflow_asr.configs.config import Config
+from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer
+from tensorflow_asr.models.ctc.jasper import Jasper
+
+DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
+
+tf.keras.backend.clear_session()
+
+parser = argparse.ArgumentParser(prog="Jasper TFLite")
+
+parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
+
+parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
+
+parser.add_argument("--subwords", type=str, default=None, help="Use subwords")
+
+parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported")
+
+args = parser.parse_args()
+
+assert args.saved and args.output
+
+config = Config(args.config)
+speech_featurizer = TFSpeechFeaturizer(config.speech_config)
+
+if args.subwords:
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
+else:
+    text_featurizer = CharFeaturizer(config.decoder_config)
+
+# build model
+jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes)
+jasper._build(speech_featurizer.shape)
+jasper.load_weights(args.saved)
+jasper.summary(line_length=100)
+jasper.add_featurizers(speech_featurizer, text_featurizer)
+
+concrete_func = jasper.make_tflite_function().get_concrete_function()
+converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
+converter.experimental_new_converter = True
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
+tflite_model = converter.convert()
+
+args.output = file_util.preprocess_paths(args.output)
+with open(args.output, "wb") as tflite_out:
+    tflite_out.write(tflite_model)
diff --git a/examples/jasper/train.py b/examples/jasper/train.py
index 444ca1314a..f27d63c066 100644
--- a/examples/jasper/train.py
+++ b/examples/jasper/train.py
@@ -14,9 +14,9 @@
 
 import os
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
+from tensorflow_asr.utils import env_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
@@ -25,19 +25,23 @@
 
 parser = argparse.ArgumentParser(prog="Jasper Training")
 
-parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
+parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
+parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
 
-parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replicas")
+parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
 
-parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replicas")
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
+
+parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
+
+parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
 
 parser.add_argument("--spx", type=int, default=1, help="Steps per execution for maximizing performance")
 
-parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata")
+parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata")
 
-parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords dataset")
+parser.add_argument("--static_length", default=False, action="store_true", help="Use static lengths")
 
 parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
 
@@ -47,57 +51,71 @@
 
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
 
-strategy = setup_strategy(args.devices)
+strategy = env_util.setup_strategy(args.devices)
 
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
-from tensorflow_asr.models.keras.jasper import Jasper
+from tensorflow_asr.datasets import asr_dataset
+from tensorflow_asr.featurizers import speech_featurizers, text_featurizers
+from tensorflow_asr.models.ctc.jasper import Jasper
 
 config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
+speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)
+
+if args.sentence_piece:
+    print("Loading SentencePiece model ...")
+    text_featurizer = text_featurizers.SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
+    print("Loading subwords ...")
+    text_featurizer = text_featurizers.SubwordFeaturizer(config.decoder_config)
+else:
+    print("Use characters ...")
+    text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)
 
 if args.tfrecords:
-    train_dataset = ASRTFRecordDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    train_dataset = asr_dataset.ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.train_dataset_config),
         indefinite=True
     )
-    eval_dataset = ASRTFRecordDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
+    eval_dataset = asr_dataset.ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        **vars(config.learning_config.eval_dataset_config),
+        indefinite=True
     )
-    # Update metadata calculated from both train and eval datasets
-    train_dataset.load_metadata(args.metadata_prefix)
-    eval_dataset.load_metadata(args.metadata_prefix)
-    # Use dynamic length
-    speech_featurizer.reset_length()
-    text_featurizer.reset_length()
 else:
-    train_dataset = ASRSliceDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    train_dataset = asr_dataset.ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.train_dataset_config),
         indefinite=True
     )
-    eval_dataset = ASRSliceDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    eval_dataset = asr_dataset.ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.eval_dataset_config),
         indefinite=True
     )
 
-global_batch_size = config.learning_config.running_config.batch_size
+train_dataset.load_metadata(args.metadata)
+eval_dataset.load_metadata(args.metadata)
+
+if not args.static_length:
+    speech_featurizer.reset_length()
+    text_featurizer.reset_length()
+
+global_batch_size = args.tbs or config.learning_config.running_config.batch_size
 global_batch_size *= strategy.num_replicas_in_sync
 
 train_data_loader = train_dataset.create(global_batch_size)
 eval_data_loader = eval_dataset.create(global_batch_size)
 
 with strategy.scope():
+    # build model
     jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes)
     jasper._build(speech_featurizer.shape)
-    jasper.summary(line_length=120)
-
+    jasper.summary(line_length=100)
     jasper.compile(
         optimizer=config.learning_config.optimizer_config,
         experimental_steps_per_execution=args.spx,
@@ -112,7 +130,10 @@
 ]
 
 jasper.fit(
-    train_data_loader, epochs=config.learning_config.running_config.num_epochs,
-    validation_data=eval_data_loader, callbacks=callbacks,
-    steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps
+    train_data_loader,
+    epochs=config.learning_config.running_config.num_epochs,
+    validation_data=eval_data_loader,
+    callbacks=callbacks,
+    steps_per_epoch=train_dataset.total_steps,
+    validation_steps=eval_dataset.total_steps
 )
diff --git a/examples/rnn_transducer/test.py b/examples/rnn_transducer/test.py
index 377ef291a0..724924ce40 100644
--- a/examples/rnn_transducer/test.py
+++ b/examples/rnn_transducer/test.py
@@ -13,17 +13,18 @@
 # limitations under the License.
 
 import os
+from tqdm import tqdm
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_devices
+from tensorflow_asr.utils import env_util, file_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
-parser = argparse.ArgumentParser(prog="Conformer Testing")
+parser = argparse.ArgumentParser(prog="RnnTransducer Testing")
 
 parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
@@ -33,63 +34,85 @@
 
 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
 
+parser.add_argument("--bs", type=int, default=None, help="Test batch size")
+
+parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
+
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
+
 parser.add_argument("--device", type=int, default=0, help="Device's id to run test on")
 
 parser.add_argument("--cpu", default=False, action="store_true", help="Whether to only use cpu")
 
-parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
-
-parser.add_argument("--output_name", type=str, default="test", help="Result filename name prefix")
+parser.add_argument("--output", type=str, default="test.tsv", help="Result filepath")
 
 args = parser.parse_args()
 
+assert args.saved
+
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
 
-setup_devices([args.device], cpu=args.cpu)
+env_util.setup_devices([args.device], cpu=args.cpu)
 
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
-from tensorflow_asr.runners.base_runners import BaseTester
-from tensorflow_asr.models.streaming_transducer import StreamingTransducer
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
+from tensorflow_asr.models.transducer.rnn_transducer import RnnTransducer
+from tensorflow_asr.utils import app_util
 
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
 
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+if args.sentence_piece:
+    print("Use SentencePiece ...")
+    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
+    print("Use subwords ...")
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
-    raise ValueError("subwords must be set")
+    print("Use characters ...")
+    text_featurizer = CharFeaturizer(config.decoder_config)
 
 tf.random.set_seed(0)
-assert args.saved
 
 if args.tfrecords:
     test_dataset = ASRTFRecordDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )
 else:
     test_dataset = ASRSliceDataset(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.test_dataset_config)
     )
 
 # build model
-streaming_transducer = StreamingTransducer(
-    vocabulary_size=text_featurizer.num_classes,
-    **config.model_config
-)
-streaming_transducer._build(speech_featurizer.shape)
-streaming_transducer.load_weights(args.saved)
-streaming_transducer.summary(line_length=150)
-streaming_transducer.add_featurizers(speech_featurizer, text_featurizer)
-
-streaming_transducer_tester = BaseTester(
-    config=config.learning_config.running_config,
-    output_name=args.output_name
-)
-streaming_transducer_tester.compile(streaming_transducer)
-streaming_transducer_tester.run(test_dataset)
+rnn_transducer = RnnTransducer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
+rnn_transducer._build(speech_featurizer.shape)
+rnn_transducer.load_weights(args.saved)
+rnn_transducer.summary(line_length=100)
+rnn_transducer.add_featurizers(speech_featurizer, text_featurizer)
+
+batch_size = args.bs or config.learning_config.running_config.batch_size
+test_data_loader = test_dataset.create(batch_size)
+
+with file_util.save_file(file_util.preprocess_paths(args.output)) as filepath:
+    overwrite = True  # default: produce results; only an existing file needs confirmation
+    if tf.io.gfile.exists(filepath):
+        overwrite = input("Overwrite existing result file? (y/n): ").lower() == "y"
+    if overwrite:
+        results = rnn_transducer.predict(test_data_loader, verbose=1)
+        print(f"Saving result to {args.output} ...")
+        with open(filepath, "w") as openfile:
+            openfile.write("PATH\tDURATION\tGROUNDTRUTH\tGREEDY\tBEAMSEARCH\n")
+            progbar = tqdm(total=test_dataset.total_steps, unit="batch")
+            for i, pred in enumerate(results):
+                groundtruth, greedy, beamsearch = [x.decode('utf-8') for x in pred]
+                path, duration, _ = test_dataset.entries[i]
+                openfile.write(f"{path}\t{duration}\t{groundtruth}\t{greedy}\t{beamsearch}\n")
+                progbar.update(1)
+            progbar.close()
+    app_util.evaluate_results(filepath)  # runs on fresh results or on the kept existing file
diff --git a/examples/rnn_transducer/tflite.py b/examples/rnn_transducer/tflite.py
index 254e56de8b..1d2092029d 100644
--- a/examples/rnn_transducer/tflite.py
+++ b/examples/rnn_transducer/tflite.py
@@ -14,33 +14,29 @@
 
 import os
 import argparse
-from tensorflow_asr.utils import setup_environment
+from tensorflow_asr.utils import env_util, file_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
-from tensorflow_asr.models.streaming_transducer import StreamingTransducer
+from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer
+from tensorflow_asr.models.transducer.rnn_transducer import RnnTransducer
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
-parser = argparse.ArgumentParser(prog="Conformer Testing")
+parser = argparse.ArgumentParser(prog="RnnTransducer TFLite")
 
-parser.add_argument("--config", type=str, default=DEFAULT_YAML,
-                    help="The file path of model configuration file")
+parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--saved", type=str, default=None,
-                    help="Path to saved model")
+parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
 
-parser.add_argument("--subwords", type=str, default=None,
-                    help="Path to file that stores generated subwords")
+parser.add_argument("--subwords", type=str, default=None, help="Use subwords")
 
-parser.add_argument("output", type=str, default=None,
-                    help="TFLite file path to be exported")
+parser.add_argument("output", type=str, default=None, help="TFLite file path to be exported")
 
 args = parser.parse_args()
 
@@ -49,30 +45,25 @@
 config = Config(args.config)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
 
-if args.subwords and os.path.exists(args.subwords):
-    print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+if args.subwords:
+    text_featurizer = SubwordFeaturizer(config.decoder_config)
 else:
-    raise ValueError("subwords must be set")
+    text_featurizer = CharFeaturizer(config.decoder_config)
 
 # build model
-streaming_transducer = StreamingTransducer(
-    **config.model_config,
-    vocabulary_size=text_featurizer.num_classes
-)
-streaming_transducer._build(speech_featurizer.shape)
-streaming_transducer.load_weights(args.saved)
-streaming_transducer.summary(line_length=150)
-streaming_transducer.add_featurizers(speech_featurizer, text_featurizer)
-
-concrete_func = streaming_transducer.make_tflite_function().get_concrete_function()
+rnn_transducer = RnnTransducer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
+rnn_transducer._build(speech_featurizer.shape)
+rnn_transducer.load_weights(args.saved)
+rnn_transducer.summary(line_length=100)
+rnn_transducer.add_featurizers(speech_featurizer, text_featurizer)
+
+concrete_func = rnn_transducer.make_tflite_function().get_concrete_function()
 converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
+converter.experimental_new_converter = True
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
-converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
-                                       tf.lite.OpsSet.SELECT_TF_OPS]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
 tflite_model = converter.convert()
 
-if not os.path.exists(os.path.dirname(args.output)):
-    os.makedirs(os.path.dirname(args.output))
+args.output = file_util.preprocess_paths(args.output)
 with open(args.output, "wb") as tflite_out:
     tflite_out.write(tflite_model)
diff --git a/examples/rnn_transducer/train.py b/examples/rnn_transducer/train.py
index 6f7c92c643..a35f7f2801 100644
--- a/examples/rnn_transducer/train.py
+++ b/examples/rnn_transducer/train.py
@@ -13,89 +13,101 @@
 # limitations under the License.
 
 import os
+import math
 import argparse
-from tensorflow_asr.utils import setup_environment, setup_strategy
+from tensorflow_asr.utils import env_util
 
-setup_environment()
+env_util.setup_environment()
 import tensorflow as tf
 
 DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
 
 tf.keras.backend.clear_session()
 
-parser = argparse.ArgumentParser(prog="Conformer Training")
+parser = argparse.ArgumentParser(prog="RnnTransducer Training")
 
 parser.add_argument("--config", type=str, default=DEFAULT_YAML, help="The file path of model configuration file")
 
-parser.add_argument("--max_ckpts", type=int, default=10, help="Max number of checkpoints to keep")
-
 parser.add_argument("--tfrecords", default=False, action="store_true", help="Whether to use tfrecords")
 
+parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
+
+parser.add_argument("--subwords", default=False, action="store_true", help="Use subwords")
+
 parser.add_argument("--tbs", type=int, default=None, help="Train batch size per replica")
 
 parser.add_argument("--ebs", type=int, default=None, help="Evaluation batch size per replica")
 
 parser.add_argument("--spx", type=int, default=1, help="Steps per execution for maximizing performance")
 
-parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata")
+parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata")
+
+parser.add_argument("--static_length", default=False, action="store_true", help="Use static lengths")
 
 parser.add_argument("--devices", type=int, nargs="*", default=[0], help="Devices' ids to apply distributed training")
 
 parser.add_argument("--mxp", default=False, action="store_true", help="Enable mixed precision")
 
-parser.add_argument("--subword", default=False, action="store_true", help="Use subword")
-
 args = parser.parse_args()
 
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
 
-strategy = setup_strategy(args.devices)
+strategy = env_util.setup_strategy(args.devices)
 
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras, ASRSliceDatasetKeras
-from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, CharFeaturizer
-from tensorflow_asr.models.keras.streaming_transducer import StreamingTransducer
+from tensorflow_asr.datasets import asr_dataset
+from tensorflow_asr.featurizers import speech_featurizers, text_featurizers
+from tensorflow_asr.models.transducer.rnn_transducer import RnnTransducer
+from tensorflow_asr.optimizers.schedules import TransformerSchedule
 
 config = Config(args.config)
-speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-
-if args.subword:
-    print("Use subwords ...")
-    text_featurizer = SubwordFeaturizer(config.decoder_config)
+speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)
+
+if args.sentence_piece:
+    print("Loading SentencePiece model ...")
+    text_featurizer = text_featurizers.SentencePieceFeaturizer(config.decoder_config)
+elif args.subwords:
+    print("Loading subwords ...")
+    text_featurizer = text_featurizers.SubwordFeaturizer(config.decoder_config)
 else:
     print("Use characters ...")
-    text_featurizer = CharFeaturizer(config.decoder_config)
+    text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)
 
 if args.tfrecords:
-    train_dataset = ASRTFRecordDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    train_dataset = asr_dataset.ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.train_dataset_config),
         indefinite=True
     )
-    eval_dataset = ASRTFRecordDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-        **vars(config.learning_config.eval_dataset_config)
+    eval_dataset = asr_dataset.ASRTFRecordDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        **vars(config.learning_config.eval_dataset_config),
+        indefinite=True
     )
-    # Update metadata calculated from both train and eval datasets
-    train_dataset.load_metadata(args.metadata_prefix)
-    eval_dataset.load_metadata(args.metadata_prefix)
-    # Use dynamic length
-    speech_featurizer.reset_length()
-    text_featurizer.reset_length()
 else:
-    train_dataset = ASRSliceDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    train_dataset = asr_dataset.ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.train_dataset_config),
         indefinite=True
     )
-    eval_dataset = ASRSliceDatasetKeras(
-        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+    eval_dataset = asr_dataset.ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
         **vars(config.learning_config.eval_dataset_config),
         indefinite=True
     )
 
-global_batch_size = config.learning_config.running_config.batch_size
+train_dataset.load_metadata(args.metadata)
+eval_dataset.load_metadata(args.metadata)
+
+if not args.static_length:
+    speech_featurizer.reset_length()
+    text_featurizer.reset_length()
+
+global_batch_size = args.tbs or config.learning_config.running_config.batch_size
 global_batch_size *= strategy.num_replicas_in_sync
 
 train_data_loader = train_dataset.create(global_batch_size)
@@ -103,16 +115,20 @@
 
 with strategy.scope():
     # build model
-    streaming_transducer = StreamingTransducer(
-        **config.model_config,
-        vocabulary_size=text_featurizer.num_classes
+    rnn_transducer = RnnTransducer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
+    rnn_transducer._build(speech_featurizer.shape)
+    rnn_transducer.summary(line_length=100)
+
+    optimizer = tf.keras.optimizers.Adam(
+        TransformerSchedule(
+            d_model=rnn_transducer.dmodel,
+            warmup_steps=config.learning_config.optimizer_config.pop("warmup_steps", 10000),
+            max_lr=(0.05 / math.sqrt(rnn_transducer.dmodel))
+        ),
+        **config.learning_config.optimizer_config
     )
-    streaming_transducer._build(speech_featurizer.shape)
-    streaming_transducer.summary(line_length=150)
-
-    optimizer = tf.keras.optimizers.get(config.learning_config.optimizer_config)
 
-    streaming_transducer.compile(
+    rnn_transducer.compile(
         optimizer=optimizer,
         experimental_steps_per_execution=args.spx,
         global_batch_size=global_batch_size,
@@ -125,8 +141,11 @@
     tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard)
 ]
 
-streaming_transducer.fit(
-    train_data_loader, epochs=config.learning_config.running_config.num_epochs,
-    validation_data=eval_data_loader, callbacks=callbacks,
-    steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps
+rnn_transducer.fit(
+    train_data_loader,
+    epochs=config.learning_config.running_config.num_epochs,
+    validation_data=eval_data_loader,
+    callbacks=callbacks,
+    steps_per_epoch=train_dataset.total_steps,
+    validation_steps=eval_dataset.total_steps
 )

From d0284895bfe33a21edc2fa0d256423f236e3c9eb Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Sat, 17 Apr 2021 14:01:30 +0700
Subject: [PATCH 11/13] :writing_hand: update contextnet and init notebook
 examples

---
 README.md                                     |  1 +
 examples/contextnet/config.yml                | 29 +++++++------------
 notebooks/conformer.ipynb                     |  0
 notebooks/contextnet.ipynb                    |  0
 notebooks/deepspeech2.ipynb                   |  0
 notebooks/jasper.ipynb                        |  0
 tensorflow_asr/models/base_model.py           |  8 +++--
 tensorflow_asr/models/ctc/ctc.py              |  4 ---
 .../models/transducer/contextnet.py           | 14 +++++----
 9 files changed, 26 insertions(+), 30 deletions(-)
 create mode 100644 notebooks/conformer.ipynb
 create mode 100644 notebooks/contextnet.ipynb
 create mode 100644 notebooks/deepspeech2.ipynb
 create mode 100644 notebooks/jasper.ipynb

diff --git a/README.md b/README.md
index df80c93f7e..36da89c78b 100755
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ TensorFlowASR implements some automatic speech recognition architectures such as
 
 ## What's New?
 
+- (04/17/2021) Refactor repository with new version 1.x
 - (02/16/2021) Supported for TPU training
 - (12/27/2020) Supported _naive_ token level timestamp, see [demo](./examples/demonstration/conformer.py) with flag `--timestamp`
 - (12/17/2020) Supported ContextNet [http://arxiv.org/abs/2005.03191](http://arxiv.org/abs/2005.03191)
diff --git a/examples/contextnet/config.yml b/examples/contextnet/config.yml
index 24b2f17e9d..790c0e5a38 100644
--- a/examples/contextnet/config.yml
+++ b/examples/contextnet/config.yml
@@ -207,8 +207,8 @@ learning_config:
           num_masks: 1
           mask_factor: 27
     data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+      - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
+    tfrecords_dir: null
     shuffle: True
     cache: True
     buffer_size: 100
@@ -217,10 +217,8 @@ learning_config:
 
   eval_dataset_config:
     use_tf: True
-    data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+    data_paths: null
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -230,8 +228,8 @@ learning_config:
   test_dataset_config:
     use_tf: True
     data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+      - /mnt/e/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -240,26 +238,21 @@ learning_config:
 
   optimizer_config:
     warmup_steps: 40000
-    beta1: 0.9
-    beta2: 0.98
+    beta_1: 0.9
+    beta_2: 0.98
     epsilon: 1e-9
 
   running_config:
     batch_size: 2
-    accumulation_steps: 4
     num_epochs: 20
-    outdir: /mnt/Miscellanea/Models/local/contextnet
-    log_interval_steps: 300
-    eval_interval_steps: 500
-    save_interval_steps: 1000
     checkpoint:
-      filepath: /mnt/Miscellanea/Models/local/contextnet/checkpoints/{epoch:02d}.h5
+      filepath: /mnt/e/Models/local/contextnet/checkpoints/{epoch:02d}.h5
       save_best_only: True
       save_weights_only: False
       save_freq: epoch
-    states_dir: /mnt/Miscellanea/Models/local/contextnet/states
+    states_dir: /mnt/e/Models/local/contextnet/states
     tensorboard:
-      log_dir: /mnt/Miscellanea/Models/local/contextnet/tensorboard
+      log_dir: /mnt/e/Models/local/contextnet/tensorboard
       histogram_freq: 1
       write_graph: True
       write_images: True
diff --git a/notebooks/conformer.ipynb b/notebooks/conformer.ipynb
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/notebooks/contextnet.ipynb b/notebooks/contextnet.ipynb
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/notebooks/deepspeech2.ipynb b/notebooks/deepspeech2.ipynb
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/notebooks/jasper.ipynb b/notebooks/jasper.ipynb
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py
index 1ebf8787e5..39ddbe4693 100644
--- a/tensorflow_asr/models/base_model.py
+++ b/tensorflow_asr/models/base_model.py
@@ -95,10 +95,12 @@ def train_step(self, batch):
             y_pred = self(inputs, training=True)
             loss = self.loss(y_true, y_pred)
             if self.use_loss_scale:
-                loss = self.optimizer.get_scaled_loss(loss)
-        gradients = tape.gradient(loss, self.trainable_weights)
+                scaled_loss = self.optimizer.get_scaled_loss(loss)
         if self.use_loss_scale:
+            gradients = tape.gradient(scaled_loss, self.trainable_weights)
             gradients = self.optimizer.get_unscaled_gradients(gradients)
+        else:
+            gradients = tape.gradient(loss, self.trainable_weights)
         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
         self._metrics["loss"].update_state(loss)
         return {m.name: m.result() for m in self.metrics}
@@ -127,6 +129,8 @@ def predict_step(self, batch):
             beam_search_decoding = self.recognize_beam(inputs)
         return tf.stack([labels, greedy_decoding, beam_search_decoding], axis=-1)
 
+    # -------------------------------- INFERENCE FUNCTIONS -------------------------------------
+
     def recognize(self, features, input_lengths, **kwargs):
         pass
 
diff --git a/tensorflow_asr/models/ctc/ctc.py b/tensorflow_asr/models/ctc/ctc.py
index a30c0e166e..0166e3ead7 100644
--- a/tensorflow_asr/models/ctc/ctc.py
+++ b/tensorflow_asr/models/ctc/ctc.py
@@ -38,10 +38,6 @@ def __init__(self,
             self.decoder = decoder
         self.time_reduction_factor = 1
 
-    @property
-    def metrics(self):
-        return [self.loss_metric]
-
     def _build(self, input_shape, batch_size=None):
         inputs = tf.keras.Input(input_shape, batch_size=batch_size, dtype=tf.float32)
         inputs_length = tf.keras.Input(shape=[], batch_size=batch_size, dtype=tf.int32)
diff --git a/tensorflow_asr/models/transducer/contextnet.py b/tensorflow_asr/models/transducer/contextnet.py
index bc81f17fe7..134a06a8f2 100644
--- a/tensorflow_asr/models/transducer/contextnet.py
+++ b/tensorflow_asr/models/transducer/contextnet.py
@@ -17,7 +17,7 @@
 
 from ..encoders.contextnet import ContextNetEncoder, L2
 from .transducer import Transducer
-from ...utils import math_util
+from ...utils import math_util, data_util
 
 
 class ContextNet(Transducer):
@@ -80,11 +80,13 @@ def __init__(self,
         for block in self.encoder.blocks: self.time_reduction_factor *= block.time_reduction_factor
 
     def call(self, inputs, training=False, **kwargs):
-        features, input_length, prediction, prediction_length = inputs
-        enc = self.encoder([features, input_length], training=training, **kwargs)
-        pred = self.predict_net([prediction, prediction_length], training=training, **kwargs)
-        outputs = self.joint_net([enc, pred], training=training, **kwargs)
-        return outputs
+        enc = self.encoder([inputs["inputs"], inputs["inputs_length"]], training=training, **kwargs)
+        pred = self.predict_net([inputs["predictions"], inputs["predictions_length"]], training=training, **kwargs)
+        logits = self.joint_net([enc, pred], training=training, **kwargs)
+        return data_util.create_logits(
+            logits=logits,
+            logits_length=math_util.get_reduced_length(inputs["inputs_length"], self.time_reduction_factor)
+        )
 
     def encoder_inference(self, features: tf.Tensor, input_length: tf.Tensor):
         with tf.name_scope(f"{self.name}_encoder"):

From 4be4a6e062c744dffb3fd392d5c96a6003d6a031 Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Sat, 17 Apr 2021 18:07:08 +0700
Subject: [PATCH 12/13] :writing_hand: test and update train script

---
 examples/conformer/config.yml       |  2 +-
 examples/contextnet/config.yml      |  4 ++--
 examples/deepspeech2/config.yml     | 27 ++++++++++-----------------
 examples/jasper/config.yml          | 27 ++++++++++-----------------
 examples/rnn_transducer/config.yml  | 27 ++++++++++-----------------
 examples/rnn_transducer/train.py    | 14 +-------------
 tensorflow_asr/models/ctc/jasper.py |  2 +-
 tensorflow_asr/utils/data_util.py   |  9 ++++++---
 tensorflow_asr/utils/env_util.py    |  2 ++
 9 files changed, 43 insertions(+), 71 deletions(-)

diff --git a/examples/conformer/config.yml b/examples/conformer/config.yml
index 0ee6487e98..9ab5255c1f 100755
--- a/examples/conformer/config.yml
+++ b/examples/conformer/config.yml
@@ -115,7 +115,7 @@ learning_config:
     checkpoint:
       filepath: /mnt/e/Models/local/conformer/checkpoints/{epoch:02d}.h5
       save_best_only: True
-      save_weights_only: False
+      save_weights_only: True
       save_freq: epoch
     states_dir: /mnt/e/Models/local/conformer/states
     tensorboard:
diff --git a/examples/contextnet/config.yml b/examples/contextnet/config.yml
index 790c0e5a38..c0b9f24dd1 100644
--- a/examples/contextnet/config.yml
+++ b/examples/contextnet/config.yml
@@ -228,7 +228,7 @@ learning_config:
   test_dataset_config:
     use_tf: True
     data_paths:
-      - /mnt/e/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv
+      - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv
     tfrecords_dir: null
     shuffle: False
     cache: True
@@ -248,7 +248,7 @@ learning_config:
     checkpoint:
       filepath: /mnt/e/Models/local/contextnet/checkpoints/{epoch:02d}.h5
       save_best_only: True
-      save_weights_only: False
+      save_weights_only: True
       save_freq: epoch
     states_dir: /mnt/e/Models/local/contextnet/states
     tensorboard:
diff --git a/examples/deepspeech2/config.yml b/examples/deepspeech2/config.yml
index 68a77d7bd4..cbc8ad65ef 100755
--- a/examples/deepspeech2/config.yml
+++ b/examples/deepspeech2/config.yml
@@ -52,8 +52,8 @@ learning_config:
   train_dataset_config:
     use_tf: True
     data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+      - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
+    tfrecords_dir: null
     shuffle: True
     cache: True
     buffer_size: 100
@@ -62,10 +62,8 @@ learning_config:
 
   eval_dataset_config:
     use_tf: True
-    data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+    data_paths: null
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -75,8 +73,8 @@ learning_config:
   test_dataset_config:
     use_tf: True
     data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+      - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -91,19 +89,14 @@ learning_config:
   running_config:
     batch_size: 4
     num_epochs: 20
-    accumulation_steps: 8
-    outdir: /mnt/Miscellanea/Models/local/deepspeech2
-    log_interval_steps: 400
-    save_interval_steps: 400
-    eval_interval_steps: 800
     checkpoint:
-      filepath: /mnt/Miscellanea/Models/local/deepspeech2/checkpoints/{epoch:02d}.h5
+      filepath: /mnt/e/Models/local/deepspeech2/checkpoints/{epoch:02d}.h5
       save_best_only: True
-      save_weights_only: False
+      save_weights_only: True
       save_freq: epoch
-    states_dir: /mnt/Miscellanea/Models/local/deepspeech2/states
+    states_dir: /mnt/e/Models/local/deepspeech2/states
     tensorboard:
-      log_dir: /mnt/Miscellanea/Models/local/deepspeech2/tensorboard
+      log_dir: /mnt/e/Models/local/deepspeech2/tensorboard
       histogram_freq: 1
       write_graph: True
       write_images: True
diff --git a/examples/jasper/config.yml b/examples/jasper/config.yml
index f6c158edce..0b16fdec89 100755
--- a/examples/jasper/config.yml
+++ b/examples/jasper/config.yml
@@ -59,8 +59,8 @@ learning_config:
   train_dataset_config:
     use_tf: True
     data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+      - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
+    tfrecords_dir: null
     shuffle: True
     cache: True
     buffer_size: 100
@@ -69,10 +69,8 @@ learning_config:
 
   eval_dataset_config:
     use_tf: True
-    data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+    data_paths: null
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -82,8 +80,8 @@ learning_config:
   test_dataset_config:
     use_tf: True
     data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+      - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -98,19 +96,14 @@ learning_config:
   running_config:
     batch_size: 4
     num_epochs: 20
-    accumulation_steps: 8
-    outdir: /mnt/Miscellanea/Models/local/jasper
-    log_interval_steps: 400
-    save_interval_steps: 400
-    eval_interval_steps: 800
     checkpoint:
-      filepath: /mnt/Miscellanea/Models/local/jasper/checkpoints/{epoch:02d}.h5
+      filepath: /mnt/e/Models/local/jasper/checkpoints/{epoch:02d}.h5
       save_best_only: True
-      save_weights_only: False
+      save_weights_only: True
       save_freq: epoch
-    states_dir: /mnt/Miscellanea/Models/local/jasper/states
+    states_dir: /mnt/e/Models/local/jasper/states
     tensorboard:
-      log_dir: /mnt/Miscellanea/Models/local/jasper/tensorboard
+      log_dir: /mnt/e/Models/local/jasper/tensorboard
       histogram_freq: 1
       write_graph: True
       write_images: True
diff --git a/examples/rnn_transducer/config.yml b/examples/rnn_transducer/config.yml
index 8acfee4f92..4efbb11024 100644
--- a/examples/rnn_transducer/config.yml
+++ b/examples/rnn_transducer/config.yml
@@ -64,8 +64,8 @@ learning_config:
           num_masks: 1
           mask_factor: 27
     data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/train-clean-100/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+      - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv
+    tfrecords_dir: null
     shuffle: True
     cache: True
     buffer_size: 100
@@ -74,10 +74,8 @@ learning_config:
 
   eval_dataset_config:
     use_tf: True
-    data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-clean/transcripts.tsv
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/dev-other/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+    data_paths: null
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -87,8 +85,8 @@ learning_config:
   test_dataset_config:
     use_tf: True
     data_paths:
-      - /mnt/Miscellanea/Datasets/Speech/LibriSpeech/test-clean/transcripts.tsv
-    tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
+      - /mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv
+    tfrecords_dir: null
     shuffle: False
     cache: True
     buffer_size: 100
@@ -102,20 +100,15 @@ learning_config:
 
   running_config:
     batch_size: 2
-    accumulation_steps: 1
     num_epochs: 20
-    outdir: /mnt/Miscellanea/Models/local/streaming_transducer
-    log_interval_steps: 300
-    eval_interval_steps: 500
-    save_interval_steps: 1000
     checkpoint:
-      filepath: /mnt/Miscellanea/Models/local/streaming_transducer/checkpoints/{epoch:02d}.h5
+      filepath: /mnt/e/Models/local/rnn_transducer/checkpoints/{epoch:02d}.h5
       save_best_only: True
-      save_weights_only: False
+      save_weights_only: True
       save_freq: epoch
-    states_dir: /mnt/Miscellanea/Models/local/streaming_transducer/states
+    states_dir: /mnt/e/Models/local/rnn_transducer/states
     tensorboard:
-      log_dir: /mnt/Miscellanea/Models/local/streaming_transducer/tensorboard
+      log_dir: /mnt/e/Models/local/rnn_transducer/tensorboard
       histogram_freq: 1
       write_graph: True
       write_images: True
diff --git a/examples/rnn_transducer/train.py b/examples/rnn_transducer/train.py
index a35f7f2801..4e3eff4ba4 100644
--- a/examples/rnn_transducer/train.py
+++ b/examples/rnn_transducer/train.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import os
-import math
 import argparse
 from tensorflow_asr.utils import env_util
 
@@ -58,7 +57,6 @@
 from tensorflow_asr.datasets import asr_dataset
 from tensorflow_asr.featurizers import speech_featurizers, text_featurizers
 from tensorflow_asr.models.transducer.rnn_transducer import RnnTransducer
-from tensorflow_asr.optimizers.schedules import TransformerSchedule
 
 config = Config(args.config)
 speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)
@@ -118,18 +116,8 @@
     rnn_transducer = RnnTransducer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
     rnn_transducer._build(speech_featurizer.shape)
     rnn_transducer.summary(line_length=100)
-
-    optimizer = tf.keras.optimizers.Adam(
-        TransformerSchedule(
-            d_model=rnn_transducer.dmodel,
-            warmup_steps=config.learning_config.optimizer_config.pop("warmup_steps", 10000),
-            max_lr=(0.05 / math.sqrt(rnn_transducer.dmodel))
-        ),
-        **config.learning_config.optimizer_config
-    )
-
     rnn_transducer.compile(
-        optimizer=optimizer,
+        optimizer=config.learning_config.optimizer_config,
         experimental_steps_per_execution=args.spx,
         global_batch_size=global_batch_size,
         blank=text_featurizer.blank
diff --git a/tensorflow_asr/models/ctc/jasper.py b/tensorflow_asr/models/ctc/jasper.py
index 963391a7bb..23b47ed063 100644
--- a/tensorflow_asr/models/ctc/jasper.py
+++ b/tensorflow_asr/models/ctc/jasper.py
@@ -357,7 +357,7 @@ def __init__(self,
                 strides=1, padding="same",
                 kernel_regularizer=kernel_regularizer,
                 bias_regularizer=bias_regularizer,
-                name=f"{self.name}_logits"
+                name=f"{name}_logits"
             ),
             vocabulary_size=vocabulary_size,
             name=name,
diff --git a/tensorflow_asr/utils/data_util.py b/tensorflow_asr/utils/data_util.py
index 324c720d49..2bcdca8d4e 100644
--- a/tensorflow_asr/utils/data_util.py
+++ b/tensorflow_asr/utils/data_util.py
@@ -21,12 +21,15 @@ def create_inputs(inputs: tf.Tensor,
                   inputs_length: tf.Tensor,
                   predictions: tf.Tensor = None,
                   predictions_length: tf.Tensor = None) -> dict:
-    return {
+    data = {
         "inputs": inputs,
         "inputs_length": inputs_length,
-        "predictions": predictions,
-        "predictions_length": predictions_length
     }
+    if predictions is not None:
+        data["predictions"] = predictions
+    if predictions_length is not None:
+        data["predictions_length"] = predictions_length
+    return data
 
 
 def create_logits(logits: tf.Tensor, logits_length: tf.Tensor) -> dict:
diff --git a/tensorflow_asr/utils/env_util.py b/tensorflow_asr/utils/env_util.py
index c5564b543e..8073b20eee 100644
--- a/tensorflow_asr/utils/env_util.py
+++ b/tensorflow_asr/utils/env_util.py
@@ -49,6 +49,8 @@ def setup_strategy(devices):
         tf.distribute.Strategy: MirroredStrategy for training one or multiple gpus
     """
     setup_devices(devices)
+    if has_tpu():
+        return setup_tpu()
     return tf.distribute.MirroredStrategy()
 
 

From 11d6afcb1ec42caaa5b39451b8b724514dd11837 Mon Sep 17 00:00:00 2001
From: Huy Le Nguyen <nlhuy.cs.16@gmail.com>
Date: Sun, 18 Apr 2021 00:34:02 +0700
Subject: [PATCH 13/13] :writing_hand: update dataset and add notebooks

---
 notebooks/conformer.ipynb              | 269 +++++++++++++++
 notebooks/contextnet.ipynb             | 433 +++++++++++++++++++++++++
 notebooks/deepspeech2.ipynb            |   0
 notebooks/jasper.ipynb                 |   0
 notebooks/rnn_transducer.ipynb         | 237 ++++++++++++++
 scripts/create_vocab_from_trans.py     |   6 +-
 scripts/generate_metadata.py           |   4 +-
 tensorflow_asr/configs/config.py       |   5 +-
 tensorflow_asr/datasets/asr_dataset.py |  36 +-
 9 files changed, 966 insertions(+), 24 deletions(-)
 delete mode 100644 notebooks/deepspeech2.ipynb
 delete mode 100644 notebooks/jasper.ipynb
 create mode 100644 notebooks/rnn_transducer.ipynb

diff --git a/notebooks/conformer.ipynb b/notebooks/conformer.ipynb
index e69de29bb2..911da8606f 100644
--- a/notebooks/conformer.ipynb
+++ b/notebooks/conformer.ipynb
@@ -0,0 +1,269 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8-final"
+  },
+  "orig_nbformat": 2,
+  "kernelspec": {
+   "name": "python388jvsc74a57bd045f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f",
+   "display_name": "Python 3.8.8 64-bit ('tfo': conda)"
+  },
+  "metadata": {
+   "interpreter": {
+    "hash": "45f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config = {\n",
+    "    \"speech_config\": {\n",
+    "        \"sample_rate\": 16000,\n",
+    "        \"frame_ms\": 25,\n",
+    "        \"stride_ms\": 10,\n",
+    "        \"num_feature_bins\": 80,\n",
+    "        \"feature_type\": \"log_mel_spectrogram\",\n",
+    "        \"preemphasis\": 0.97,\n",
+    "        \"normalize_signal\": True,\n",
+    "        \"normalize_feature\": True,\n",
+    "        \"normalize_per_feature\": False,\n",
+    "    },\n",
+    "    \"decoder_config\": {\n",
+    "        \"vocabulary\": None,\n",
+    "        \"target_vocab_size\": 1000,\n",
+    "        \"max_subword_length\": 10,\n",
+    "        \"blank_at_zero\": True,\n",
+    "        \"beam_width\": 0,\n",
+    "        \"norm_score\": True,\n",
+    "        \"corpus_files\": None,\n",
+    "    },\n",
+    "    \"model_config\": {\n",
+    "        \"name\": \"conformer\",\n",
+    "        \"encoder_subsampling\": {\n",
+    "            \"type\": \"conv2d\",\n",
+    "            \"filters\": 144,\n",
+    "            \"kernel_size\": 3,\n",
+    "            \"strides\": 2,\n",
+    "        },\n",
+    "        \"encoder_positional_encoding\": \"sinusoid_concat\",\n",
+    "        \"encoder_dmodel\": 144,\n",
+    "        \"encoder_num_blocks\": 16,\n",
+    "        \"encoder_head_size\": 36,\n",
+    "        \"encoder_num_heads\": 4,\n",
+    "        \"encoder_mha_type\": \"relmha\",\n",
+    "        \"encoder_kernel_size\": 32,\n",
+    "        \"encoder_fc_factor\": 0.5,\n",
+    "        \"encoder_dropout\": 0.1,\n",
+    "        \"prediction_embed_dim\": 320,\n",
+    "        \"prediction_embed_dropout\": 0,\n",
+    "        \"prediction_num_rnns\": 1,\n",
+    "        \"prediction_rnn_units\": 320,\n",
+    "        \"prediction_rnn_type\": \"lstm\",\n",
+    "        \"prediction_rnn_implementation\": 2,\n",
+    "        \"prediction_layer_norm\": True,\n",
+    "        \"prediction_projection_units\": 0,\n",
+    "        \"joint_dim\": 320,\n",
+    "        \"prejoint_linear\": True,\n",
+    "        \"joint_activation\": \"tanh\",\n",
+    "        \"joint_mode\": \"add\",\n",
+    "    },\n",
+    "    \"learning_config\": {\n",
+    "        \"train_dataset_config\": {\n",
+    "            \"use_tf\": True,\n",
+    "            \"augmentation_config\": {\n",
+    "                \"feature_augment\": {\n",
+    "                    \"time_masking\": {\n",
+    "                        \"num_masks\": 10,\n",
+    "                        \"mask_factor\": 100,\n",
+    "                        \"p_upperbound\": 0.05,\n",
+    "                    },\n",
+    "                    \"freq_masking\": {\"num_masks\": 1, \"mask_factor\": 27},\n",
+    "                }\n",
+    "            },\n",
+    "            \"data_paths\": [\n",
+    "                \"/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv\"\n",
+    "            ],\n",
+    "            \"tfrecords_dir\": None,\n",
+    "            \"shuffle\": True,\n",
+    "            \"cache\": True,\n",
+    "            \"buffer_size\": 100,\n",
+    "            \"drop_remainder\": True,\n",
+    "            \"stage\": \"train\",\n",
+    "        },\n",
+    "        \"eval_dataset_config\": {\n",
+    "            \"use_tf\": True,\n",
+    "            \"data_paths\": None,\n",
+    "            \"tfrecords_dir\": None,\n",
+    "            \"shuffle\": False,\n",
+    "            \"cache\": True,\n",
+    "            \"buffer_size\": 100,\n",
+    "            \"drop_remainder\": True,\n",
+    "            \"stage\": \"eval\",\n",
+    "        },\n",
+    "        \"test_dataset_config\": {\n",
+    "            \"use_tf\": True,\n",
+    "            \"data_paths\": None,\n",
+    "            \"tfrecords_dir\": None,\n",
+    "            \"shuffle\": False,\n",
+    "            \"cache\": True,\n",
+    "            \"buffer_size\": 100,\n",
+    "            \"drop_remainder\": True,\n",
+    "            \"stage\": \"test\",\n",
+    "        },\n",
+    "        \"optimizer_config\": {\n",
+    "            \"warmup_steps\": 40000,\n",
+    "            \"beta_1\": 0.9,\n",
+    "            \"beta_2\": 0.98,\n",
+    "            \"epsilon\": 1e-09,\n",
+    "        },\n",
+    "        \"running_config\": {\n",
+    "            \"batch_size\": 2,\n",
+    "            \"num_epochs\": 50,\n",
+    "            \"checkpoint\": {\n",
+    "                \"filepath\": \"/mnt/e/Models/local/conformer/checkpoints/{epoch:02d}.h5\",\n",
+    "                \"save_best_only\": True,\n",
+    "                \"save_weights_only\": True,\n",
+    "                \"save_freq\": \"epoch\",\n",
+    "            },\n",
+    "            \"states_dir\": \"/mnt/e/Models/local/conformer/states\",\n",
+    "            \"tensorboard\": {\n",
+    "                \"log_dir\": \"/mnt/e/Models/local/conformer/tensorboard\",\n",
+    "                \"histogram_freq\": 1,\n",
+    "                \"write_graph\": True,\n",
+    "                \"write_images\": True,\n",
+    "                \"update_freq\": \"epoch\",\n",
+    "                \"profile_batch\": 2,\n",
+    "            },\n",
+    "        },\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "metadata = {\n",
+    "    \"train\": {\"max_input_length\": 2974, \"max_label_length\": 194, \"num_entries\": 281241},\n",
+    "    \"eval\": {\"max_input_length\": 3516, \"max_label_length\": 186, \"num_entries\": 5567},\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import math\n",
+    "import argparse\n",
+    "from tensorflow_asr.utils import env_util\n",
+    "\n",
+    "env_util.setup_environment()\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "tf.keras.backend.clear_session()\n",
+    "tf.config.optimizer.set_experimental_options({\"auto_mixed_precision\": True})\n",
+    "strategy = env_util.setup_strategy([0])\n",
+    "\n",
+    "from tensorflow_asr.configs.config import Config\n",
+    "from tensorflow_asr.datasets import asr_dataset\n",
+    "from tensorflow_asr.featurizers import speech_featurizers, text_featurizers\n",
+    "from tensorflow_asr.models.transducer.conformer import Conformer\n",
+    "from tensorflow_asr.optimizers.schedules import TransformerSchedule\n",
+    "\n",
+    "config = Config(config)\n",
+    "speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)\n",
+    "\n",
+    "text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)\n",
+    "\n",
+    "train_dataset = asr_dataset.ASRSliceDataset(\n",
+    "    speech_featurizer=speech_featurizer,\n",
+    "    text_featurizer=text_featurizer,\n",
+    "    **vars(config.learning_config.train_dataset_config),\n",
+    "    indefinite=True\n",
+    ")\n",
+    "eval_dataset = asr_dataset.ASRSliceDataset(\n",
+    "    speech_featurizer=speech_featurizer,\n",
+    "    text_featurizer=text_featurizer,\n",
+    "    **vars(config.learning_config.eval_dataset_config),\n",
+    "    indefinite=True\n",
+    ")\n",
+    "\n",
+    "train_dataset.load_metadata(metadata)\n",
+    "eval_dataset.load_metadata(metadata)\n",
+    "speech_featurizer.reset_length()\n",
+    "text_featurizer.reset_length()\n",
+    "\n",
+    "global_batch_size = config.learning_config.running_config.batch_size\n",
+    "global_batch_size *= strategy.num_replicas_in_sync\n",
+    "\n",
+    "train_data_loader = train_dataset.create(global_batch_size)\n",
+    "eval_data_loader = eval_dataset.create(global_batch_size)\n",
+    "\n",
+    "with strategy.scope():\n",
+    "    # build model\n",
+    "    conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)\n",
+    "    conformer._build(speech_featurizer.shape)\n",
+    "    conformer.summary(line_length=100)\n",
+    "\n",
+    "    optimizer = tf.keras.optimizers.Adam(\n",
+    "        TransformerSchedule(\n",
+    "            d_model=conformer.dmodel,\n",
+    "            warmup_steps=config.learning_config.optimizer_config.pop(\"warmup_steps\", 10000),\n",
+    "            max_lr=(0.05 / math.sqrt(conformer.dmodel))\n",
+    "        ),\n",
+    "        **config.learning_config.optimizer_config\n",
+    "    )\n",
+    "\n",
+    "    conformer.compile(\n",
+    "        optimizer=optimizer,\n",
+    "        experimental_steps_per_execution=10,\n",
+    "        global_batch_size=global_batch_size,\n",
+    "        blank=text_featurizer.blank\n",
+    "    )\n",
+    "\n",
+    "callbacks = [\n",
+    "    tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint),\n",
+    "    tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir),\n",
+    "    tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard)\n",
+    "]\n",
+    "\n",
+    "conformer.fit(\n",
+    "    train_data_loader,\n",
+    "    epochs=config.learning_config.running_config.num_epochs,\n",
+    "    validation_data=eval_data_loader,\n",
+    "    callbacks=callbacks,\n",
+    "    steps_per_epoch=train_dataset.total_steps,\n",
+    "    validation_steps=eval_dataset.total_steps\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ]
+}
\ No newline at end of file
diff --git a/notebooks/contextnet.ipynb b/notebooks/contextnet.ipynb
index e69de29bb2..22efd1ca29 100644
--- a/notebooks/contextnet.ipynb
+++ b/notebooks/contextnet.ipynb
@@ -0,0 +1,433 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8-final"
+  },
+  "orig_nbformat": 2,
+  "kernelspec": {
+   "name": "python388jvsc74a57bd045f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f",
+   "display_name": "Python 3.8.8 64-bit ('tfo': conda)"
+  },
+  "metadata": {
+   "interpreter": {
+    "hash": "45f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config = {\n",
+    "    \"speech_config\": {\n",
+    "        \"sample_rate\": 16000,\n",
+    "        \"frame_ms\": 25,\n",
+    "        \"stride_ms\": 10,\n",
+    "        \"num_feature_bins\": 80,\n",
+    "        \"feature_type\": \"log_mel_spectrogram\",\n",
+    "        \"preemphasis\": 0.97,\n",
+    "        \"normalize_signal\": True,\n",
+    "        \"normalize_feature\": True,\n",
+    "        \"normalize_per_feature\": False,\n",
+    "    },\n",
+    "    \"decoder_config\": {\n",
+    "        \"vocabulary\": None,\n",
+    "        \"target_vocab_size\": 1024,\n",
+    "        \"max_subword_length\": 4,\n",
+    "        \"blank_at_zero\": True,\n",
+    "        \"beam_width\": 5,\n",
+    "        \"norm_score\": True,\n",
+    "    },\n",
+    "    \"model_config\": {\n",
+    "        \"name\": \"contextnet\",\n",
+    "        \"encoder_alpha\": 0.5,\n",
+    "        \"encoder_blocks\": [\n",
+    "            {\n",
+    "                \"nlayers\": 1,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 256,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": False,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 256,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 256,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 256,\n",
+    "                \"strides\": 2,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 256,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 256,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 256,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 256,\n",
+    "                \"strides\": 2,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 256,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 256,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 256,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 512,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 512,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 512,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 512,\n",
+    "                \"strides\": 2,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 512,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 512,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 512,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 512,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 512,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 512,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 5,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 512,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": True,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"nlayers\": 1,\n",
+    "                \"kernel_size\": 5,\n",
+    "                \"filters\": 640,\n",
+    "                \"strides\": 1,\n",
+    "                \"residual\": False,\n",
+    "                \"activation\": \"silu\",\n",
+    "            },\n",
+    "        ],\n",
+    "        \"prediction_embed_dim\": 640,\n",
+    "        \"prediction_embed_dropout\": 0,\n",
+    "        \"prediction_num_rnns\": 1,\n",
+    "        \"prediction_rnn_units\": 640,\n",
+    "        \"prediction_rnn_type\": \"lstm\",\n",
+    "        \"prediction_rnn_implementation\": 1,\n",
+    "        \"prediction_layer_norm\": True,\n",
+    "        \"prediction_projection_units\": 0,\n",
+    "        \"joint_dim\": 640,\n",
+    "        \"joint_activation\": \"tanh\",\n",
+    "    },\n",
+    "    \"learning_config\": {\n",
+    "        \"train_dataset_config\": {\n",
+    "            \"use_tf\": True,\n",
+    "            \"augmentation_config\": {\n",
+    "                \"feature_augment\": {\n",
+    "                    \"time_masking\": {\n",
+    "                        \"num_masks\": 10,\n",
+    "                        \"mask_factor\": 100,\n",
+    "                        \"p_upperbound\": 0.05,\n",
+    "                    },\n",
+    "                    \"freq_masking\": {\"num_masks\": 1, \"mask_factor\": 27},\n",
+    "                }\n",
+    "            },\n",
+    "            \"data_paths\": [\n",
+    "                \"/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv\"\n",
+    "            ],\n",
+    "            \"tfrecords_dir\": None,\n",
+    "            \"shuffle\": True,\n",
+    "            \"cache\": True,\n",
+    "            \"buffer_size\": 100,\n",
+    "            \"drop_remainder\": True,\n",
+    "            \"stage\": \"train\",\n",
+    "        },\n",
+    "        \"eval_dataset_config\": {\n",
+    "            \"use_tf\": True,\n",
+    "            \"data_paths\": None,\n",
+    "            \"tfrecords_dir\": None,\n",
+    "            \"shuffle\": False,\n",
+    "            \"cache\": True,\n",
+    "            \"buffer_size\": 100,\n",
+    "            \"drop_remainder\": True,\n",
+    "            \"stage\": \"eval\",\n",
+    "        },\n",
+    "        \"test_dataset_config\": {\n",
+    "            \"use_tf\": True,\n",
+    "            \"data_paths\": [\n",
+    "                \"/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv\"\n",
+    "            ],\n",
+    "            \"tfrecords_dir\": None,\n",
+    "            \"shuffle\": False,\n",
+    "            \"cache\": True,\n",
+    "            \"buffer_size\": 100,\n",
+    "            \"drop_remainder\": True,\n",
+    "            \"stage\": \"test\",\n",
+    "        },\n",
+    "        \"optimizer_config\": {\n",
+    "            \"warmup_steps\": 40000,\n",
+    "            \"beta_1\": 0.9,\n",
+    "            \"beta_2\": 0.98,\n",
+    "            \"epsilon\": 1e-09,\n",
+    "        },\n",
+    "        \"running_config\": {\n",
+    "            \"batch_size\": 2,\n",
+    "            \"num_epochs\": 20,\n",
+    "            \"checkpoint\": {\n",
+    "                \"filepath\": \"/mnt/e/Models/local/contextnet/checkpoints/{epoch:02d}.h5\",\n",
+    "                \"save_best_only\": True,\n",
+    "                \"save_weights_only\": True,\n",
+    "                \"save_freq\": \"epoch\",\n",
+    "            },\n",
+    "            \"states_dir\": \"/mnt/e/Models/local/contextnet/states\",\n",
+    "            \"tensorboard\": {\n",
+    "                \"log_dir\": \"/mnt/e/Models/local/contextnet/tensorboard\",\n",
+    "                \"histogram_freq\": 1,\n",
+    "                \"write_graph\": True,\n",
+    "                \"write_images\": True,\n",
+    "                \"update_freq\": \"epoch\",\n",
+    "                \"profile_batch\": 2,\n",
+    "            },\n",
+    "        },\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "metadata = {\n",
+    "    \"train\": {\"max_input_length\": 2974, \"max_label_length\": 194, \"num_entries\": 281241},\n",
+    "    \"eval\": {\"max_input_length\": 3516, \"max_label_length\": 186, \"num_entries\": 5567},\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import math\n",
+    "import argparse\n",
+    "from tensorflow_asr.utils import env_util\n",
+    "\n",
+    "env_util.setup_environment()\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "tf.keras.backend.clear_session()\n",
+    "tf.config.optimizer.set_experimental_options({\"auto_mixed_precision\": True})\n",
+    "strategy = env_util.setup_strategy([0])\n",
+    "\n",
+    "from tensorflow_asr.configs.config import Config\n",
+    "from tensorflow_asr.datasets import asr_dataset\n",
+    "from tensorflow_asr.featurizers import speech_featurizers, text_featurizers\n",
+    "from tensorflow_asr.models.transducer.contextnet import ContextNet\n",
+    "from tensorflow_asr.optimizers.schedules import TransformerSchedule\n",
+    "\n",
+    "config = Config(config)\n",
+    "speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)\n",
+    "\n",
+    "text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)\n",
+    "\n",
+    "train_dataset = asr_dataset.ASRSliceDataset(\n",
+    "    speech_featurizer=speech_featurizer,\n",
+    "    text_featurizer=text_featurizer,\n",
+    "    **vars(config.learning_config.train_dataset_config),\n",
+    "    indefinite=True\n",
+    ")\n",
+    "eval_dataset = asr_dataset.ASRSliceDataset(\n",
+    "    speech_featurizer=speech_featurizer,\n",
+    "    text_featurizer=text_featurizer,\n",
+    "    **vars(config.learning_config.eval_dataset_config),\n",
+    "    indefinite=True\n",
+    ")\n",
+    "\n",
+    "train_dataset.load_metadata(metadata)\n",
+    "eval_dataset.load_metadata(metadata)\n",
+    "speech_featurizer.reset_length()\n",
+    "text_featurizer.reset_length()\n",
+    "\n",
+    "global_batch_size = config.learning_config.running_config.batch_size\n",
+    "global_batch_size *= strategy.num_replicas_in_sync\n",
+    "\n",
+    "train_data_loader = train_dataset.create(global_batch_size)\n",
+    "eval_data_loader = eval_dataset.create(global_batch_size)\n",
+    "\n",
+    "with strategy.scope():\n",
+    "    # build model\n",
+    "    contextnet = ContextNet(**config.model_config, vocabulary_size=text_featurizer.num_classes)\n",
+    "    contextnet._build(speech_featurizer.shape)\n",
+    "    contextnet.summary(line_length=100)\n",
+    "\n",
+    "    optimizer = tf.keras.optimizers.Adam(\n",
+    "        TransformerSchedule(\n",
+    "            d_model=contextnet.dmodel,\n",
+    "            warmup_steps=config.learning_config.optimizer_config.pop(\"warmup_steps\", 10000),\n",
+    "            max_lr=(0.05 / math.sqrt(contextnet.dmodel))\n",
+    "        ),\n",
+    "        **config.learning_config.optimizer_config\n",
+    "    )\n",
+    "\n",
+    "    contextnet.compile(\n",
+    "        optimizer=optimizer,\n",
+    "        experimental_steps_per_execution=10,\n",
+    "        global_batch_size=global_batch_size,\n",
+    "        blank=text_featurizer.blank\n",
+    "    )\n",
+    "\n",
+    "callbacks = [\n",
+    "    tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint),\n",
+    "    tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir),\n",
+    "    tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard)\n",
+    "]\n",
+    "\n",
+    "contextnet.fit(\n",
+    "    train_data_loader,\n",
+    "    epochs=config.learning_config.running_config.num_epochs,\n",
+    "    validation_data=eval_data_loader,\n",
+    "    callbacks=callbacks,\n",
+    "    steps_per_epoch=train_dataset.total_steps,\n",
+    "    validation_steps=eval_dataset.total_steps\n",
+    ")"
+   ]
+  }
+ ]
+}
\ No newline at end of file
diff --git a/notebooks/deepspeech2.ipynb b/notebooks/deepspeech2.ipynb
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/notebooks/jasper.ipynb b/notebooks/jasper.ipynb
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/notebooks/rnn_transducer.ipynb b/notebooks/rnn_transducer.ipynb
new file mode 100644
index 0000000000..efa97dc3fd
--- /dev/null
+++ b/notebooks/rnn_transducer.ipynb
@@ -0,0 +1,237 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8-final"
+  },
+  "orig_nbformat": 2,
+  "kernelspec": {
+   "name": "python388jvsc74a57bd045f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f",
+   "display_name": "Python 3.8.8 64-bit ('tfo': conda)"
+  },
+  "metadata": {
+   "interpreter": {
+    "hash": "45f983f364f7a4cc7101e6d6987a2125bf0c2b5c5c9855ff35103689f542d13f"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config = {\n",
+    "    \"speech_config\": {\n",
+    "        \"sample_rate\": 16000,\n",
+    "        \"frame_ms\": 25,\n",
+    "        \"stride_ms\": 10,\n",
+    "        \"num_feature_bins\": 80,\n",
+    "        \"feature_type\": \"log_mel_spectrogram\",\n",
+    "        \"preemphasis\": 0.97,\n",
+    "        \"normalize_signal\": True,\n",
+    "        \"normalize_feature\": True,\n",
+    "        \"normalize_per_feature\": False,\n",
+    "    },\n",
+    "    \"decoder_config\": {\n",
+    "        \"vocabulary\": None,\n",
+    "        \"target_vocab_size\": 1024,\n",
+    "        \"max_subword_length\": 4,\n",
+    "        \"blank_at_zero\": True,\n",
+    "        \"beam_width\": 5,\n",
+    "        \"norm_score\": True,\n",
+    "    },\n",
+    "    \"model_config\": {\n",
+    "        \"name\": \"streaming_transducer\",\n",
+    "        \"encoder_reductions\": {0: 3, 1: 2},\n",
+    "        \"encoder_dmodel\": 320,\n",
+    "        \"encoder_rnn_type\": \"lstm\",\n",
+    "        \"encoder_rnn_units\": 1024,\n",
+    "        \"encoder_nlayers\": 8,\n",
+    "        \"encoder_layer_norm\": True,\n",
+    "        \"prediction_embed_dim\": 320,\n",
+    "        \"prediction_embed_dropout\": 0.0,\n",
+    "        \"prediction_num_rnns\": 2,\n",
+    "        \"prediction_rnn_units\": 1024,\n",
+    "        \"prediction_rnn_type\": \"lstm\",\n",
+    "        \"prediction_projection_units\": 320,\n",
+    "        \"prediction_layer_norm\": True,\n",
+    "        \"joint_dim\": 320,\n",
+    "        \"joint_activation\": \"tanh\",\n",
+    "    },\n",
+    "    \"learning_config\": {\n",
+    "        \"train_dataset_config\": {\n",
+    "            \"use_tf\": True,\n",
+    "            \"augmentation_config\": {\n",
+    "                \"feature_augment\": {\n",
+    "                    \"time_masking\": {\n",
+    "                        \"num_masks\": 10,\n",
+    "                        \"mask_factor\": 100,\n",
+    "                        \"p_upperbound\": 0.05,\n",
+    "                    },\n",
+    "                    \"freq_masking\": {\"num_masks\": 1, \"mask_factor\": 27},\n",
+    "                }\n",
+    "            },\n",
+    "            \"data_paths\": [\n",
+    "                \"/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv\"\n",
+    "            ],\n",
+    "            \"tfrecords_dir\": None,\n",
+    "            \"shuffle\": True,\n",
+    "            \"cache\": True,\n",
+    "            \"buffer_size\": 100,\n",
+    "            \"drop_remainder\": True,\n",
+    "            \"stage\": \"train\",\n",
+    "        },\n",
+    "        \"eval_dataset_config\": {\n",
+    "            \"use_tf\": True,\n",
+    "            \"data_paths\": None,\n",
+    "            \"tfrecords_dir\": None,\n",
+    "            \"shuffle\": False,\n",
+    "            \"cache\": True,\n",
+    "            \"buffer_size\": 100,\n",
+    "            \"drop_remainder\": True,\n",
+    "            \"stage\": \"eval\",\n",
+    "        },\n",
+    "        \"test_dataset_config\": {\n",
+    "            \"use_tf\": True,\n",
+    "            \"data_paths\": [\n",
+    "                \"/mnt/h/ML/Datasets/ASR/Raw/LibriSpeech/test-clean/transcripts.tsv\"\n",
+    "            ],\n",
+    "            \"tfrecords_dir\": None,\n",
+    "            \"shuffle\": False,\n",
+    "            \"cache\": True,\n",
+    "            \"buffer_size\": 100,\n",
+    "            \"drop_remainder\": True,\n",
+    "            \"stage\": \"test\",\n",
+    "        },\n",
+    "        \"optimizer_config\": {\"class_name\": \"adam\", \"config\": {\"learning_rate\": 0.0001}},\n",
+    "        \"running_config\": {\n",
+    "            \"batch_size\": 2,\n",
+    "            \"num_epochs\": 20,\n",
+    "            \"checkpoint\": {\n",
+    "                \"filepath\": \"/mnt/e/Models/local/rnn_transducer/checkpoints/{epoch:02d}.h5\",\n",
+    "                \"save_best_only\": True,\n",
+    "                \"save_weights_only\": True,\n",
+    "                \"save_freq\": \"epoch\",\n",
+    "            },\n",
+    "            \"states_dir\": \"/mnt/e/Models/local/rnn_transducer/states\",\n",
+    "            \"tensorboard\": {\n",
+    "                \"log_dir\": \"/mnt/e/Models/local/rnn_transducer/tensorboard\",\n",
+    "                \"histogram_freq\": 1,\n",
+    "                \"write_graph\": True,\n",
+    "                \"write_images\": True,\n",
+    "                \"update_freq\": \"epoch\",\n",
+    "                \"profile_batch\": 2,\n",
+    "            },\n",
+    "        },\n",
+    "    },\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "metadata = {\n",
+    "    \"train\": {\"max_input_length\": 2974, \"max_label_length\": 194, \"num_entries\": 281241},\n",
+    "    \"eval\": {\"max_input_length\": 3516, \"max_label_length\": 186, \"num_entries\": 5567},\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import math\n",
+    "import argparse\n",
+    "from tensorflow_asr.utils import env_util\n",
+    "\n",
+    "env_util.setup_environment()\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "tf.keras.backend.clear_session()\n",
+    "tf.config.optimizer.set_experimental_options({\"auto_mixed_precision\": True})\n",
+    "strategy = env_util.setup_strategy([0])\n",
+    "\n",
+    "from tensorflow_asr.configs.config import Config\n",
+    "from tensorflow_asr.datasets import asr_dataset\n",
+    "from tensorflow_asr.featurizers import speech_featurizers, text_featurizers\n",
+    "from tensorflow_asr.models.transducer.rnn_transducer import RnnTransducer\n",
+    "from tensorflow_asr.optimizers.schedules import TransformerSchedule\n",
+    "\n",
+    "config = Config(config)\n",
+    "speech_featurizer = speech_featurizers.TFSpeechFeaturizer(config.speech_config)\n",
+    "\n",
+    "text_featurizer = text_featurizers.CharFeaturizer(config.decoder_config)\n",
+    "\n",
+    "train_dataset = asr_dataset.ASRSliceDataset(\n",
+    "    speech_featurizer=speech_featurizer,\n",
+    "    text_featurizer=text_featurizer,\n",
+    "    **vars(config.learning_config.train_dataset_config),\n",
+    "    indefinite=True\n",
+    ")\n",
+    "eval_dataset = asr_dataset.ASRSliceDataset(\n",
+    "    speech_featurizer=speech_featurizer,\n",
+    "    text_featurizer=text_featurizer,\n",
+    "    **vars(config.learning_config.eval_dataset_config),\n",
+    "    indefinite=True\n",
+    ")\n",
+    "\n",
+    "train_dataset.load_metadata(metadata)\n",
+    "eval_dataset.load_metadata(metadata)\n",
+    "speech_featurizer.reset_length()\n",
+    "text_featurizer.reset_length()\n",
+    "\n",
+    "global_batch_size = config.learning_config.running_config.batch_size\n",
+    "global_batch_size *= strategy.num_replicas_in_sync\n",
+    "\n",
+    "train_data_loader = train_dataset.create(global_batch_size)\n",
+    "eval_data_loader = eval_dataset.create(global_batch_size)\n",
+    "\n",
+    "with strategy.scope():\n",
+    "    # build model\n",
+    "    rnnt = RnnTransducer(**config.model_config, vocabulary_size=text_featurizer.num_classes)\n",
+    "    rnnt._build(speech_featurizer.shape)\n",
+    "    rnnt.summary(line_length=100)\n",
+    "\n",
+    "    rnnt.compile(\n",
+    "        optimizer=config.learning_config.optimizer_config,\n",
+    "        experimental_steps_per_execution=10,\n",
+    "        global_batch_size=global_batch_size,\n",
+    "        blank=text_featurizer.blank\n",
+    "    )\n",
+    "\n",
+    "callbacks = [\n",
+    "    tf.keras.callbacks.ModelCheckpoint(**config.learning_config.running_config.checkpoint),\n",
+    "    tf.keras.callbacks.experimental.BackupAndRestore(config.learning_config.running_config.states_dir),\n",
+    "    tf.keras.callbacks.TensorBoard(**config.learning_config.running_config.tensorboard)\n",
+    "]\n",
+    "\n",
+    "rnnt.fit(\n",
+    "    train_data_loader,\n",
+    "    epochs=config.learning_config.running_config.num_epochs,\n",
+    "    validation_data=eval_data_loader,\n",
+    "    callbacks=callbacks,\n",
+    "    steps_per_epoch=train_dataset.total_steps,\n",
+    "    validation_steps=eval_dataset.total_steps\n",
+    ")"
+   ]
+  }
+ ]
+}
\ No newline at end of file
diff --git a/scripts/create_vocab_from_trans.py b/scripts/create_vocab_from_trans.py
index a4a2f20c61..a42148a98b 100644
--- a/scripts/create_vocab_from_trans.py
+++ b/scripts/create_vocab_from_trans.py
@@ -17,11 +17,9 @@
 
 parser = argparse.ArgumentParser(prog="Create vocabulary file from transcripts")
 
-parser.add_argument("--output", type=str,
-                    default=None, help="The output .txt vocabulary file path")
+parser.add_argument("--output", type=str, default=None, help="The output .txt vocabulary file path")
 
-parser.add_argument("transcripts", nargs="+", type=str,
-                    default=None, help="Transcript .tsv files")
+parser.add_argument("transcripts", nargs="+", type=str, default=None, help="Transcript .tsv files")
 
 args = parser.parse_args()
 
diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py
index 48b0315943..2d6883d204 100644
--- a/scripts/generate_metadata.py
+++ b/scripts/generate_metadata.py
@@ -28,7 +28,7 @@
 
 parser.add_argument("--sentence_piece", default=False, action="store_true", help="Whether to use `SentencePiece` model")
 
-parser.add_argument("--metadata_prefix", type=str, default=None, help="Path to file containing metadata")
+parser.add_argument("--metadata", type=str, default=None, help="Path to file containing metadata")
 
 parser.add_argument("--subwords", type=str, default=None, help="Path to file that stores generated subwords")
 
@@ -57,4 +57,4 @@
     stage=args.stage, shuffle=False,
 )
 
-dataset.update_metadata(args.metadata_prefix)
+dataset.update_metadata(args.metadata)
diff --git a/tensorflow_asr/configs/config.py b/tensorflow_asr/configs/config.py
index 028016e853..12fb73a959 100644
--- a/tensorflow_asr/configs/config.py
+++ b/tensorflow_asr/configs/config.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Union
 from ..augmentations.augmentation import Augmentation
 from ..utils import file_util
 
@@ -75,8 +76,8 @@ def __init__(self, config: dict = None):
 class Config:
     """ User config class for training, testing or infering """
 
-    def __init__(self, path: str):
-        config = file_util.load_yaml(file_util.preprocess_paths(path))
+    def __init__(self, data: Union[str, dict]):
+        config = data if isinstance(data, dict) else file_util.load_yaml(file_util.preprocess_paths(data))
         self.speech_config = config.pop("speech_config", {})
         self.decoder_config = config.pop("decoder_config", {})
         self.model_config = config.pop("model_config", {})
diff --git a/tensorflow_asr/datasets/asr_dataset.py b/tensorflow_asr/datasets/asr_dataset.py
index 1b6fdca3b6..2b2e61a7ea 100755
--- a/tensorflow_asr/datasets/asr_dataset.py
+++ b/tensorflow_asr/datasets/asr_dataset.py
@@ -14,6 +14,7 @@
 
 import os
 import json
+from typing import Union
 import tqdm
 import numpy as np
 import tensorflow as tf
@@ -80,24 +81,27 @@ def save_metadata(self, metadata: str = None):
             f.write(json.dumps(content, indent=2))
         print(f"Metadata written to {metadata}")
 
-    def load_metadata(self, metadata: str = None):
+    def load_metadata(self, metadata: Union[str, dict] = None):
         if metadata is None: return
-        metadata = file_util.preprocess_paths(metadata)
-        if tf.io.gfile.exists(metadata):
-            print(f"Loading metadata from {metadata} ...")
-            with tf.io.gfile.GFile(metadata, "r") as f:
-                try:
-                    content = json.loads(f.read()).get(self.stage, {})
-                except json.JSONDecodeError:
-                    raise ValueError(f'File {metadata} must be in json format')
-                self.speech_featurizer.update_length(int(content.get("max_input_length", 0)))
-                self.text_featurizer.update_length(int(content.get("max_label_length", 0)))
-                self.total_steps = int(content.get("num_entries", 0))
-
-    def update_metadata(self, metadata_prefix: str = None):
-        self.load_metadata(metadata_prefix)
+        if isinstance(metadata, dict):
+            content = metadata
+        else:
+            metadata = file_util.preprocess_paths(metadata)
+            if tf.io.gfile.exists(metadata):
+                print(f"Loading metadata from {metadata} ...")
+                with tf.io.gfile.GFile(metadata, "r") as f:
+                    try:
+                        content = json.loads(f.read()).get(self.stage, {})
+                    except json.JSONDecodeError:
+                        raise ValueError(f'File {metadata} must be in json format')
+        self.speech_featurizer.update_length(int(content.get("max_input_length", 0)))
+        self.text_featurizer.update_length(int(content.get("max_label_length", 0)))
+        self.total_steps = int(content.get("num_entries", 0))
+
+    def update_metadata(self, metadata: str = None):
+        self.load_metadata(metadata)
         self.compute_metadata()
-        self.save_metadata(metadata_prefix)
+        self.save_metadata(metadata)
 
     # -------------------------------- ENTRIES -------------------------------------