diff --git a/README.md b/README.md
index dd0f5c8..0e96a4a 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,6 @@ TCN(
     kernel_initializer='he_normal',
     use_batch_norm=False,
     use_layer_norm=False,
-    use_weight_norm=False,
     go_backwards=False,
     return_state=False,
     **kwargs
@@ -64,7 +63,6 @@ TCN(
 - `kernel_initializer`: Initializer for the kernel weights matrix (Conv1D).
 - `use_batch_norm`: Whether to use batch normalization in the residual layers or not.
 - `use_layer_norm`: Whether to use layer normalization in the residual layers or not.
-- `use_weight_norm`: Whether to use weight normalization in the residual layers or not.
 - `go_backwards`: Boolean (default False). If True, process the input sequence backwards and return the reversed sequence.
 - `return_state`: Boolean. Whether to return the last state in addition to the output. Default: False.
 - `kwargs`: Any other set of arguments for configuring the parent class Layer. For example "name=str", Name of the model. Use unique names when using multiple TCN.
@@ -96,7 +94,7 @@ Here are some of my notes regarding my experience using TCN:
 - `activation`: Leave it to default. I have never changed it.
 - `kernel_initializer`: If the training of the TCN gets stuck, it might be worth changing this parameter. For example: `glorot_uniform`.
-- `use_batch_norm`, `use_weight_norm`, `use_layer_norm`: Use normalization if your network is big enough and the task contains enough data. I usually prefer using `use_layer_norm`, but you can try them all and see which one works the best.
+- `use_batch_norm`, `use_layer_norm`: Use normalization if your network is big enough and the task contains enough data. I usually prefer using `use_layer_norm`, but you can try them both and see which one works the best.
 
 ### Receptive field
diff --git a/setup.py b/setup.py
index ebc72e4..3666590 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 os.environ['GRPC_PYTHON_BUILD_SYSTEM_OPENSSL'] = '1'
 os.environ['GRPC_PYTHON_BUILD_SYSTEM_ZLIB'] = '1'
 
-install_requires = ['numpy', tensorflow, 'tensorflow_addons']
+install_requires = ['numpy', tensorflow]
 
 setup(
     name='keras-tcn',
diff --git a/tasks/adding_problem/main.py b/tasks/adding_problem/main.py
index 9eb8acf..d2c39e3 100644
--- a/tasks/adding_problem/main.py
+++ b/tasks/adding_problem/main.py
@@ -26,7 +26,6 @@ def run_task():
                          nb_stacks=1,
                          max_len=x_train.shape[1],
                          use_skip_connections=False,
-                         use_weight_norm=True,
                          regression=True,
                          dropout_rate=0
                          )
diff --git a/tasks/copy_memory/main.py b/tasks/copy_memory/main.py
index 1c4ddbe..318fb4b 100644
--- a/tasks/copy_memory/main.py
+++ b/tasks/copy_memory/main.py
@@ -30,7 +30,6 @@ def run_task():
                          use_skip_connections=True,
                          opt='rmsprop',
                          lr=5e-4,
-                         use_weight_norm=True,
                          return_sequences=True)
 
     print(f'x_train.shape = {x_train.shape}')
diff --git a/tasks/mnist_pixel/main.py b/tasks/mnist_pixel/main.py
index 48c548c..6f73150 100644
--- a/tasks/mnist_pixel/main.py
+++ b/tasks/mnist_pixel/main.py
@@ -14,7 +14,6 @@ def run_task():
                          dilations=[2 ** i for i in range(9)],
                          nb_stacks=1,
                          max_len=x_train[0:1].shape[1],
-                         use_weight_norm=True,
                          use_skip_connections=True)
 
     print(f'x_train.shape = {x_train.shape}')
diff --git a/tasks/time_series_forecasting.py b/tasks/time_series_forecasting.py
index 1baf6d8..963738d 100644
--- a/tasks/time_series_forecasting.py
+++ b/tasks/time_series_forecasting.py
@@ -38,7 +38,6 @@
         kernel_size=2,
         use_skip_connections=False,
         use_batch_norm=False,
-        use_weight_norm=False,
         use_layer_norm=False
         ),
     Dense(1, activation='linear')
diff --git a/tcn/tcn.py b/tcn/tcn.py
index d8b9f66..9483c37 100644
--- a/tcn/tcn.py
+++ b/tcn/tcn.py
@@ -28,6 +28,7 @@ class ResidualBlock(Layer):
 
     def __init__(self,
                  dilation_rate: int,
+                 residual_depth: int,
                  nb_filters: int,
                  kernel_size: int,
                  padding: str,
@@ -36,13 +37,13 @@ def __init__(self,
                  kernel_initializer: str = 'he_normal',
                  use_batch_norm: bool = False,
                  use_layer_norm: bool = False,
-                 use_weight_norm: bool = False,
                  **kwargs):
         """Defines the residual block for the WaveNet TCN
         Args:
             x: The previous layer in the model
             training: boolean indicating whether the layer should behave in training mode or in inference mode
             dilation_rate: The dilation power of 2 we are using for this residual block
+            residual_depth: The number of residual convolutions to use in this block
             nb_filters: The number of convolutional filters to use in this block
             kernel_size: The size of the convolutional kernel
             padding: The padding used in the convolutional layers, 'same' or 'causal'.
@@ -51,11 +52,11 @@
             kernel_initializer: Initializer for the kernel weights matrix (Conv1D).
             use_batch_norm: Whether to use batch normalization in the residual layers or not.
             use_layer_norm: Whether to use layer normalization in the residual layers or not.
-            use_weight_norm: Whether to use weight normalization in the residual layers or not.
             kwargs: Any initializers for Layer class.
         """
 
         self.dilation_rate = dilation_rate
+        self.residual_depth = residual_depth
         self.nb_filters = nb_filters
         self.kernel_size = kernel_size
         self.padding = padding
@@ -63,7 +64,6 @@
         self.dropout_rate = dropout_rate
         self.use_batch_norm = use_batch_norm
         self.use_layer_norm = use_layer_norm
-        self.use_weight_norm = use_weight_norm
         self.kernel_initializer = kernel_initializer
         self.layers = []
         self.shape_match_conv = None
@@ -88,7 +88,7 @@ def build(self, input_shape):
             self.layers = []
             self.res_output_shape = input_shape
 
-            for k in range(2):  # dilated conv block.
+            for k in range(self.residual_depth):  # dilated conv block.
                 name = 'conv1D_{}'.format(k)
                 with K.name_scope(name):  # name scope used to make sure weights get unique names
                     conv = Conv1D(
@@ -99,11 +99,6 @@
                         name=name,
                         kernel_initializer=self.kernel_initializer
                     )
-                    if self.use_weight_norm:
-                        from tensorflow_addons.layers import WeightNormalization
-                        # wrap it. WeightNormalization API is different than BatchNormalization or LayerNormalization.
-                        with K.name_scope('norm_{}'.format(k)):
-                            conv = WeightNormalization(conv)
                     self._build_layer(conv)
 
                     with K.name_scope('norm_{}'.format(k)):
@@ -111,8 +106,6 @@
                             self._build_layer(BatchNormalization())
                         elif self.use_layer_norm:
                             self._build_layer(LayerNormalization())
-                        elif self.use_weight_norm:
-                            pass  # done above.
 
                     with K.name_scope('act_and_dropout_{}'.format(k)):
                         self._build_layer(Activation(self.activation, name='Act_Conv1D_{}'.format(k)))
@@ -191,6 +184,7 @@ class TCN(Layer):
             nb_filters: The number of filters to use in the convolutional layers. Can be a list.
             kernel_size: The size of the kernel to use in each convolutional layer.
             dilations: The list of the dilations. Example is: [1, 2, 4, 8, 16, 32, 64].
+            residual_depth: The depth of a residual block. Default is 2.
             nb_stacks : The number of stacks of residual blocks to use.
             padding: The padding to use in the convolutional layers, 'causal' or 'same'.
             use_skip_connections: Boolean. If we want to add skip connections from input to each residual blocK.
@@ -200,7 +194,6 @@
             kernel_initializer: Initializer for the kernel weights matrix (Conv1D).
             use_batch_norm: Whether to use batch normalization in the residual layers or not.
             use_layer_norm: Whether to use layer normalization in the residual layers or not.
-            use_weight_norm: Whether to use weight normalization in the residual layers or not.
             go_backwards: Boolean (default False). If True, process the input sequence backwards and return the reversed sequence.
             return_state: Boolean. Whether to return the last state in addition to the output. Default: False.
@@ -213,8 +206,9 @@
     def __init__(self,
                  nb_filters=64,
                  kernel_size=3,
-                 nb_stacks=1,
                  dilations=(1, 2, 4, 8, 16, 32),
+                 residual_depth=2,
+                 nb_stacks=1,
                  padding='causal',
                  use_skip_connections=True,
                  dropout_rate=0.0,
@@ -223,7 +217,6 @@
                  kernel_initializer='he_normal',
                  use_batch_norm=False,
                  use_layer_norm=False,
-                 use_weight_norm=False,
                  go_backwards=False,
                  return_state=False,
                  **kwargs):
@@ -232,6 +225,7 @@ def __init__(self,
         self.dropout_rate = dropout_rate
         self.use_skip_connections = use_skip_connections
         self.dilations = dilations
+        self.residual_depth = residual_depth
         self.nb_stacks = nb_stacks
         self.kernel_size = kernel_size
         self.nb_filters = nb_filters
@@ -240,7 +234,6 @@
         self.kernel_initializer = kernel_initializer
         self.use_batch_norm = use_batch_norm
         self.use_layer_norm = use_layer_norm
-        self.use_weight_norm = use_weight_norm
         self.go_backwards = go_backwards
         self.return_state = return_state
         self.skip_connections = []
@@ -251,7 +244,10 @@
         self.output_slice_index = None  # in case return_sequence=False
         self.padding_same_and_time_dim_unknown = False  # edge case if padding='same' and time_dim = None
 
-        if self.use_batch_norm + self.use_layer_norm + self.use_weight_norm > 1:
+        if self.residual_depth < 1:
+            raise ValueError('Residual depth must be at least 1.')
+
+        if self.use_batch_norm + self.use_layer_norm > 1:
             raise ValueError('Only one normalization can be specified at once.')
 
         if isinstance(self.nb_filters, list):
@@ -268,7 +264,7 @@
 
     @property
     def receptive_field(self):
-        return 1 + 2 * (self.kernel_size - 1) * self.nb_stacks * sum(self.dilations)
+        return 1 + self.residual_depth * (self.kernel_size - 1) * self.nb_stacks * sum(self.dilations)
 
     def tolist(self, shape):
         try:
@@ -291,6 +287,7 @@ def build(self, input_shape):
             for i, d in enumerate(self.dilations):
                 res_block_filters = self.nb_filters[i] if isinstance(self.nb_filters, list) else self.nb_filters
                 self.residual_blocks.append(ResidualBlock(dilation_rate=d,
+                                                          residual_depth=self.residual_depth,
                                                           nb_filters=res_block_filters,
                                                           kernel_size=self.kernel_size,
                                                           padding=self.padding,
@@ -298,7 +295,6 @@
                                                           dropout_rate=self.dropout_rate,
                                                           use_batch_norm=self.use_batch_norm,
                                                           use_layer_norm=self.use_layer_norm,
-                                                          use_weight_norm=self.use_weight_norm,
                                                           kernel_initializer=self.kernel_initializer,
                                                           name='residual_block_{}'.format(len(self.residual_blocks))))
                 # build newest residual block
@@ -355,7 +351,7 @@ def call(self, inputs, training=None, **kwargs):
             self.skip_connections.append(skip_out)
             self.layers_outputs.append(x)
 
-        if self.use_skip_connections:
+        if self.use_skip_connections and len(self.skip_connections) > 0:
             if len(self.skip_connections) > 1:
                 # Keras: A merge layer should be called on a list of at least 2 inputs. Got 1 input.
                 x = layers.add(self.skip_connections, name='Add_Skip_Connections')
@@ -388,7 +384,6 @@ def get_config(self):
         config['activation'] = self.activation_name
         config['use_batch_norm'] = self.use_batch_norm
         config['use_layer_norm'] = self.use_layer_norm
-        config['use_weight_norm'] = self.use_weight_norm
         config['kernel_initializer'] = self.kernel_initializer
         config['go_backwards'] = self.go_backwards
         config['return_state'] = self.return_state
@@ -414,8 +409,7 @@ def compiled_tcn(num_feat,  # type: int
                  opt='adam',
                  lr=0.002,
                  use_batch_norm=False,
-                 use_layer_norm=False,
-                 use_weight_norm=False):
+                 use_layer_norm=False):
     # type: (...) -> Model
     """Creates a compiled TCN model for a given task (i.e. regression or classification).
     Classification uses a sparse categorical loss. Please input class ids and not one-hot encodings.
@@ -440,7 +434,6 @@
         lr: Learning rate.
         use_batch_norm: Whether to use batch normalization in the residual layers or not.
         use_layer_norm: Whether to use layer normalization in the residual layers or not.
-        use_weight_norm: Whether to use weight normalization in the residual layers or not.
     Returns:
         A compiled keras TCN.
     """
@@ -451,8 +444,8 @@ def compiled_tcn(num_feat,  # type: int
 
-    x = TCN(nb_filters, kernel_size, nb_stacks, dilations, padding,
-            use_skip_connections, dropout_rate, return_sequences,
-            activation, kernel_initializer, use_batch_norm, use_layer_norm,
-            use_weight_norm, name=name)(input_layer)
+    x = TCN(nb_filters, kernel_size, dilations=dilations, nb_stacks=nb_stacks, padding=padding,
+            use_skip_connections=use_skip_connections, dropout_rate=dropout_rate,
+            return_sequences=return_sequences, activation=activation, kernel_initializer=kernel_initializer,
+            use_batch_norm=use_batch_norm, use_layer_norm=use_layer_norm, name=name)(input_layer)
 
     print('x.shape=', x.shape)
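
Below is a brief usage sketch (not part of the patch) showing the new `residual_depth` argument and the updated receptive-field formula; it assumes the patched `TCN` layer from `tcn/tcn.py` above and standard TensorFlow/Keras imports.

```python
# Hypothetical usage sketch, assuming the patched keras-tcn API above (not part of the diff).
# Each residual block now stacks `residual_depth` dilated convolutions, so the receptive
# field is 1 + residual_depth * (kernel_size - 1) * nb_stacks * sum(dilations).
from tensorflow.keras import Input, Model
from tcn import TCN

kernel_size, residual_depth, nb_stacks = 3, 2, 1
dilations = (1, 2, 4, 8, 16, 32)

tcn_layer = TCN(nb_filters=64,
                kernel_size=kernel_size,
                dilations=dilations,
                residual_depth=residual_depth,
                nb_stacks=nb_stacks,
                use_layer_norm=True)  # use_weight_norm is gone; batch/layer norm remain

# 1 + 2 * (3 - 1) * 1 * 63 = 253 timesteps
print(tcn_layer.receptive_field)

# Wire the layer into a model as usual.
inputs = Input(shape=(None, 1))
model = Model(inputs, tcn_layer(inputs))
```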