Migrating Train a Vision Transformer on small datasets example to Keras 3 #1991

sitamgithub-MSIT · 2024-11-21T20:05:26Z

This PR migrates the "Train a Vision Transformer on small datasets" example to Keras 3.0 [TF-Only Example].

For reference, here is the Colab notebook with the migrated example:
https://colab.research.google.com/drive/1ugp-3Zkkev9RNfuboWTFpS5202hhUcFv?usp=sharing

The Git diff for the changed file is shown below:

Changes:

````diff
diff --git a/examples/vision/vit_small_ds.py b/examples/vision/vit_small_ds.py
index 40ef2c52..658068dc 100644
--- a/examples/vision/vit_small_ds.py
+++ b/examples/vision/vit_small_ds.py
@@ -35,25 +35,23 @@ This example implements the ideas of the paper. A large part of this
 example is inspired from
 [Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/).
 
-_Note_: This example requires TensorFlow 2.6 or higher, as well as
-[TensorFlow Addons](https://www.tensorflow.org/addons), which can be
-installed using the following command:
-
-```python
-pip install -qq -U tensorflow-addons
-```
+_Note_: This example requires TensorFlow 2.6 or higher.
+
 """
 """
 ## Setup
 """
+import os
+
+os.environ["KERAS_BACKEND"] = "tensorflow"
 
 import math
 import numpy as np
+import keras
+from keras import ops
+from keras import layers
 import tensorflow as tf
-from tensorflow import keras
-import tensorflow_addons as tfa
 import matplotlib.pyplot as plt
-from tensorflow.keras import layers
 
 # Setting seed for reproducibiltiy
 SEED = 42
````
```diff
@@ -216,7 +214,7 @@ class ShiftedPatchTokenization(layers.Layer):
     def call(self, images):
         if not self.vanilla:
             # Concat the shifted images with the original image
-            images = tf.concat(
+            images = ops.concatenate(
                 [
                     images,
                     self.crop_shift_pad(images, mode="left-up"),
```

```diff
@@ -252,8 +250,9 @@ class ShiftedPatchTokenization(layers.Layer):
 # Get a random image from the training dataset
 # and resize the image
 image = x_train[np.random.choice(range(x_train.shape[0]))]
-resized_image = tf.image.resize(
-    tf.convert_to_tensor([image]), size=(IMAGE_SIZE, IMAGE_SIZE)
+resized_image = ops.cast(
+    ops.image.resize(ops.convert_to_tensor([image]), size=(IMAGE_SIZE, IMAGE_SIZE)),
+    dtype="float32",
 )
 
 # Vanilla patch maker: This takes an image and divides into
```

```diff
@@ -267,7 +266,7 @@ for row in range(n):
     for col in range(n):
         plt.subplot(n, n, count)
         count = count + 1
-        image = tf.reshape(patch[row][col], (PATCH_SIZE, PATCH_SIZE, 3))
+        image = ops.reshape(patch[row][col], (PATCH_SIZE, PATCH_SIZE, 3))
         plt.imshow(image)
         plt.axis("off")
 plt.show()
```
```diff
@@ -286,7 +285,7 @@ for index, name in enumerate(shifted_images):
         for col in range(n):
             plt.subplot(n, n, count)
             count = count + 1
-            image = tf.reshape(patch[row][col], (PATCH_SIZE, PATCH_SIZE, 5 * 3))
+            image = ops.reshape(patch[row][col], (PATCH_SIZE, PATCH_SIZE, 5 * 3))
             plt.imshow(image[..., 3 * index : 3 * index + 3])
             plt.axis("off")
     plt.show()
```
```diff
@@ -308,7 +307,7 @@ class PatchEncoder(layers.Layer):
         self.position_embedding = layers.Embedding(
             input_dim=num_patches, output_dim=projection_dim
         )
-        self.positions = tf.range(start=0, limit=self.num_patches, delta=1)
+        self.positions = ops.arange(start=0, stop=self.num_patches, step=1)

     def call(self, encoded_patches):
         encoded_positions = self.position_embedding(self.positions)
```
```diff
@@ -355,7 +354,7 @@ at a later stage.
 """


-class MultiHeadAttentionLSA(tf.keras.layers.MultiHeadAttention):
+class MultiHeadAttentionLSA(layers.MultiHeadAttention):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         # The trainable temperature term. The initial value is
@@ -363,13 +362,13 @@ class MultiHeadAttentionLSA(tf.keras.layers.MultiHeadAttention):
         self.tau = tf.Variable(math.sqrt(float(self._key_dim)), trainable=True)

     def _compute_attention(self, query, key, value, attention_mask=None, training=None):
-        query = tf.multiply(query, 1.0 / self.tau)
-        attention_scores = tf.einsum(self._dot_product_equation, key, query)
+        query = ops.multiply(query, 1.0 / self.tau)
+        attention_scores = ops.einsum(self._dot_product_equation, key, query)
         attention_scores = self._masked_softmax(attention_scores, attention_mask)
         attention_scores_dropout = self._dropout_layer(
             attention_scores, training=training
         )
-        attention_output = tf.einsum(
+        attention_output = ops.einsum(
             self._combine_equation, attention_scores_dropout, value
         )
         return attention_output, attention_scores
```

```diff
@@ -382,14 +381,14 @@ class MultiHeadAttentionLSA(tf.keras.layers.MultiHeadAttention):

 def mlp(x, hidden_units, dropout_rate):
     for units in hidden_units:
-        x = layers.Dense(units, activation=tf.nn.gelu)(x)
+        x = layers.Dense(units, activation="gelu")(x)
         x = layers.Dropout(dropout_rate)(x)
     return x


 # Build the diagonal attention mask
-diag_attn_mask = 1 - tf.eye(NUM_PATCHES)
-diag_attn_mask = tf.cast([diag_attn_mask], dtype=tf.int8)
+diag_attn_mask = 1 - ops.eye(NUM_PATCHES)
+diag_attn_mask = ops.cast([diag_attn_mask], dtype="int8")

 """
 ## Build the ViT
```

```diff
@@ -463,9 +462,9 @@ class WarmUpCosine(keras.optimizers.schedules.LearningRateSchedule):
         if self.total_steps < self.warmup_steps:
             raise ValueError("Total_steps must be larger or equal to warmup_steps.")

-        cos_annealed_lr = tf.cos(
+        cos_annealed_lr = ops.cos(
             self.pi
-            * (tf.cast(step, tf.float32) - self.warmup_steps)
+            * (ops.cast(step, dtype="float32") - self.warmup_steps)
             / float(self.total_steps - self.warmup_steps)
         )
         learning_rate = 0.5 * self.learning_rate_base * (1 + cos_annealed_lr)
```

```diff
@@ -479,11 +478,13 @@ class WarmUpCosine(keras.optimizers.schedules.LearningRateSchedule):
             slope = (
                 self.learning_rate_base - self.warmup_learning_rate
             ) / self.warmup_steps
-            warmup_rate = slope * tf.cast(step, tf.float32) + self.warmup_learning_rate
-            learning_rate = tf.where(
+            warmup_rate = (
+                slope * ops.cast(step, dtype="float32") + self.warmup_learning_rate
+            )
+            learning_rate = ops.where(
                 step < self.warmup_steps, warmup_rate, learning_rate
             )
-        return tf.where(
+        return ops.where(
             step > self.total_steps, 0.0, learning_rate, name="learning_rate"
         )
```

```diff
@@ -499,7 +500,7 @@ def run_experiment(model):
         warmup_steps=warmup_steps,
     )

-    optimizer = tfa.optimizers.AdamW(
+    optimizer = keras.optimizers.AdamW(
         learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY
     )
```

(END)

fchollet

Thanks for the PR! I think we could go further:

- Use `keras.ops.image.extract_patches` instead of `tf.image.extract_patches`
- Use `keras.ops.array` instead of `tf.constant`
- Use `keras.ops.image.crop_images` instead of `tf.image.crop_to_bounding_box`
- Use `keras.ops.image.pad_images` instead of `tf.image.pad_to_bounding_box`

After this I believe the example should be backend-agnostic.

fchollet · 2024-11-25T20:03:41Z

examples/vision/vit_small_ds.py

@@ -355,21 +354,21 @@ def call(self, encoded_patches):
 """


-class MultiHeadAttentionLSA(tf.keras.layers.MultiHeadAttention):
+class MultiHeadAttentionLSA(layers.MultiHeadAttention):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The trainable temperature term. The initial value is
        # the square root of the key dimension.
        self.tau = tf.Variable(math.sqrt(float(self._key_dim)), trainable=True)


Please replace this with keras.Variable()

sitamgithub-MSIT · 2024-11-25T23:23:15Z

@fchollet All the requested changes have been made, and the example is now completely backend-agnostic; I tested it with all three backends.

migrate vit small dataset example to keras3

0631aec

github-actions bot assigned sachinprasadhs Nov 21, 2024

fchollet reviewed Nov 25, 2024

View reviewed changes

requested changes added and backend agnostic done

fe947ef

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Migrating Train a Vision Transformer on small datasets example to Keras 3 #1991

Migrating Train a Vision Transformer on small datasets example to Keras 3 #1991

sitamgithub-MSIT commented Nov 21, 2024

fchollet left a comment

fchollet Nov 25, 2024

sitamgithub-MSIT Nov 25, 2024

sitamgithub-MSIT commented Nov 25, 2024

Migrating Train a Vision Transformer on small datasets example to Keras 3 #1991

Are you sure you want to change the base?

Migrating Train a Vision Transformer on small datasets example to Keras 3 #1991

Conversation

sitamgithub-MSIT commented Nov 21, 2024

Setup

Setting seed for reproducibiltiy

Get a random image from the training dataset

and resize the image

Vanilla patch maker: This takes an image and divides into

Build the diagonal attention mask

Build the ViT

fchollet left a comment

Choose a reason for hiding this comment

fchollet Nov 25, 2024

Choose a reason for hiding this comment

sitamgithub-MSIT Nov 25, 2024

Choose a reason for hiding this comment

sitamgithub-MSIT commented Nov 25, 2024