From 58f9007b0508c29c391661b5f37a5e743885bac9 Mon Sep 17 00:00:00 2001
From: mauicv <a.thornysort@gmail.com>
Date: Sat, 6 Jan 2024 16:52:56 +0000
Subject: [PATCH] Add benchmarking scripts (#7)

---
 .gitignore                          |   5 +-
 Makefile                            |   2 +
 src/benchmarks/profiles.py          |  31 ++++++++
 src/benchmarks/profiling.py         | 117 ++++++++++++++++++++++++++++
 src/pytfex/models/__init__.py       |  34 ++++++++
 src/pytfex/models/basic.py          |  43 ++++++++++
 src/pytfex/models/moe.py            |  49 ++++++++++++
 src/pytfex/transformer/attention.py |   1 +
 src/pytfex/transformer/gpt.py       |   1 -
 src/pytfex/transformer/moe.py       |   2 +-
 src/tests/conftest.py               |  50 +++++++-----
 src/tests/test_model_train.py       |  14 ++--
 12 files changed, 320 insertions(+), 29 deletions(-)
 create mode 100644 Makefile
 create mode 100644 src/benchmarks/profiles.py
 create mode 100644 src/benchmarks/profiling.py
 create mode 100644 src/pytfex/models/__init__.py
 create mode 100644 src/pytfex/models/basic.py
 create mode 100644 src/pytfex/models/moe.py

diff --git a/.gitignore b/.gitignore
index 79c5f39..042581d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,7 @@ venv
 __pycache__
 *.egg-info
 /dist
-/build
\ No newline at end of file
+/build
+
+*.prof
+*.png
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..70c3b01
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,4 @@
+# Run the forward-pass profiling benchmarks.
+.PHONY: run_profiles
+run_profiles:
+	python src/benchmarks/profiles.py
diff --git a/src/benchmarks/profiles.py b/src/benchmarks/profiles.py
new file mode 100644
index 0000000..276f9fc
--- /dev/null
+++ b/src/benchmarks/profiles.py
@@ -0,0 +1,46 @@
+"""Profile one forward pass of every benchmark model configuration."""
+import torch
+
+from profiling import Profiling
+from pytfex.models import get_model, GPTMoEConfig, GPTBasicConfig
+from pytfex.utils import count_parameters, set_seed
+
+
+# Model configurations to profile; one report is printed per entry.
+benchmarks = [
+    GPTBasicConfig(num_layers=1, hdn_dim=1024),
+    GPTMoEConfig(num_layers=1, num_experts=21, c=1, hdn_dim=512),
+]
+
+
+def run_benchmark(config):
+    """Build the model for ``config``, profile one forward pass, print it."""
+    print(config)
+    set_seed(0)
+    model = get_model(config)
+    print(f'Number of parameters: {count_parameters(model)}')
+
+    t1 = torch.randint(0, config.vcb_size, (config.batch_size, config.blk_size))
+
+    model.eval()
+    # Inference only: do not let autograd bookkeeping skew the timings.
+    with torch.no_grad():
+        # Reference pass without profiling hooks.
+        output_1 = model(t1)
+        with Profiling(model) as p:
+            output_2 = model(t1)
+
+    # The profiling hooks must not change what the model computes.
+    assert torch.allclose(output_1, output_2), 'different outputs'
+
+    print(p)
+
+
+def main():
+    """Profile every configuration in ``benchmarks``."""
+    for config in benchmarks:
+        run_benchmark(config)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/src/benchmarks/profiling.py b/src/benchmarks/profiling.py
new file mode 100644
index 0000000..b5f967a
--- /dev/null
+++ b/src/benchmarks/profiling.py
@@ -0,0 +1,175 @@
+"""Forward-pass profiler built on ``torch.nn.Module`` hooks."""
+import torch
+import time
+import string
+import random
+
+alphabet = string.ascii_lowercase + string.digits
+# Private generator: naming hooks must not advance the global ``random`` state
+# of the program being profiled (which may have been seeded on purpose).
+_rng = random.Random()
+
+
+def uuid(length=4):
+    """Return ``length`` random characters drawn from ``alphabet``."""
+    return ''.join(_rng.choices(alphabet, k=length))
+
+
+class Profiling(object):
+    """Timestamp the start and end of ``forward`` for a model's direct children.
+
+    Typical use::
+
+        with Profiling(model) as p:
+            model(x)
+        print(p)
+
+    Hooks are installed by ``start()`` and detached again when the ``with``
+    block exits; events are only recorded while profiling is switched on.
+    """
+
+    def __init__(self, model):
+        """Prepare a profiler for ``model`` (no hooks are installed yet).
+
+        Raises:
+            TypeError: if ``model`` is not a ``torch.nn.Module``.
+        """
+        if not isinstance(model, torch.nn.Module):
+            raise TypeError(
+                "Not a valid model, please provide a 'nn.Module' instance."
+            )
+
+        self.model = model
+        # Chronological (component name, timestamp, 'start' | 'end') entries.
+        self.record = {
+            'forward': [],
+        }
+        self.profiling_on = True
+        self.origin_call = {}
+        self.hook_done = False
+        # Handles of the registered hooks, kept so they can be removed later.
+        self._handles = []
+
+    def __enter__(self):
+        self.start()
+        return self
+
+    def __exit__(self, *args):
+        self.stop()
+        # Do not leave timing hooks attached to the caller's model.
+        self.remove_hooks()
+
+    def __str__(self):
+        """Render the recorded events as a human-readable report."""
+        header = "\n================================= Profile =================================\n"
+        events = self.record['forward']
+        if not events:
+            return header + "\nNo forward events recorded.\n"
+
+        ts = events[0][1]
+        te = events[-1][1]
+        separator = '-------------------\n'
+        parts = [
+            header,
+            "\nFORWARD TIME:\n",
+            f"\nTotal time:\t{1000*(te - ts):.6f} ms\n\n",
+            separator,
+        ]
+
+        # Gap between each pair of consecutive events.
+        for i, ((name1, ts1, event1), (name2, ts2, event2)) in enumerate(
+            zip(events, events[1:])
+        ):
+            parts.append(
+                f"event{i+1:3d}:\t{1000*(ts2 - ts1):10.6f} ms"
+                f"\t({event1}:{name1} -> {event2}:{name2})\n"
+            )
+
+        # Time spent inside each hooked component.
+        parts.append(separator)
+        component_time = 0
+        for name, ts1, ts2 in self.component_events:
+            diff = ts2 - ts1
+            parts.append(f"{1000*(diff):0.6f} ms \t ({name}) \n")
+            component_time += diff
+
+        parts.append(separator)
+        parts.append(f"{1000*(component_time):0.6f} ms \t (total-component-time) \n")
+        parts.append(f"{1000*(te - ts - component_time):0.6f} ms \t (others) \n")
+
+        return ''.join(parts)
+
+    def start(self):
+        """Install the hooks if necessary, switch recording on, return self."""
+        if not self.hook_done:
+            self.hook_done = True
+            self.hook_modules(self.model, self.model.__class__.__name__)
+        self.profiling_on = True
+        return self
+
+    @property
+    def component_events(self):
+        """Yield ``(name, start_ts, end_ts)`` per component in first-seen order.
+
+        A component recorded several times reports its latest timestamps.
+        Components lacking a 'start' or an 'end' entry (e.g. ``forward``
+        raised part-way through) are skipped instead of raising ``KeyError``.
+        """
+        comp_data = {}
+        for component_name, ts, event in self.record['forward']:
+            comp_data.setdefault(component_name, {})[event] = ts
+
+        for component_name, stamps in comp_data.items():
+            if 'start' in stamps and 'end' in stamps:
+                yield component_name, stamps['start'], stamps['end']
+
+    def stop(self):
+        """Switch recording off (hooks stay installed) and return self."""
+        self.profiling_on = False
+        return self
+
+    def remove_hooks(self):
+        """Detach every hook this profiler registered on the model."""
+        for handle in self._handles:
+            handle.remove()
+        self._handles.clear()
+        # Let a later start() register fresh hooks.
+        self.hook_done = False
+
+    def hook_modules(self, module, name):
+        """Hook every direct child of ``module``.
+
+        A child that is a ``torch.nn.ModuleList`` is expanded one level and
+        each element is hooked as ``<child name>-<index>``.
+
+        Args:
+            module: parent module whose children get hooked.
+            name: unused; kept so existing callers keep working.
+        """
+        for child_name, layer in module.named_children():
+            if isinstance(layer, torch.nn.ModuleList):
+                for ind, sub_sub_module in enumerate(layer):
+                    self._hook_module(f'{child_name}-{ind}', sub_sub_module)
+            else:
+                self._hook_module(child_name, layer)
+
+    def _hook_module(self, name, layer):
+        """Register 'start'/'end' timestamp hooks on ``layer``.
+
+        A random suffix is appended to ``name`` for the recorded entries.
+        """
+        name = name + '-' + uuid(length=4)
+
+        def make_hook(event):
+            def hook(layer, *args, **kwargs):
+                # perf_counter is monotonic and high resolution, unlike time().
+                t = time.perf_counter()
+                if self.profiling_on:
+                    self.record['forward'].append((name, t, event))
+
+            return hook
+
+        self._handles.append(layer.register_forward_hook(make_hook('end')))
+        self._handles.append(
+            layer.register_forward_pre_hook(make_hook('start'))
+        )
diff --git a/src/pytfex/models/__init__.py b/src/pytfex/models/__init__.py
new file mode 100644
index 0000000..8cc2b63
--- /dev/null
+++ b/src/pytfex/models/__init__.py
@@ -0,0 +1,49 @@
+"""Ready-made GPT model configurations and a factory to build them."""
+from pytfex.models.moe import get_moe_gpt_config
+from pytfex.models.basic import get_basic_gpt_config
+from pytfex.transformer.make_model import init_from_yml_string
+from dataclasses import dataclass
+
+
+@dataclass
+class GPTMoEConfig:
+    """Settings substituted into the mixture-of-experts GPT YAML template."""
+
+    model_type: str = 'gpt-moe'  # key used by get_model to pick the template
+    vcb_size: int = 65  # token dictionary / classification head vocab size
+    hdn_dim: int = 256  # value used for every hidden_dim in the template
+    blk_size: int = 256  # num_positions of the position embedder
+    c: int = 2  # forwarded to the MoE layer as its ``c`` parameter
+    num_experts: int = 4  # number of MLP experts listed for the MoE layer
+    batch_size: int = 32  # not used by the template; read by callers
+    num_layers: int = 2  # number of TransformerLayer blocks
+
+
+@dataclass
+class GPTBasicConfig:
+    """Settings substituted into the basic GPT YAML template."""
+
+    model_type: str = 'gpt-basic'  # key used by get_model to pick the template
+    vcb_size: int = 65  # token dictionary / classification head vocab size
+    hdn_dim: int = 256  # value used for every hidden_dim in the template
+    blk_size: int = 256  # num_positions of the position embedder
+    batch_size: int = 32  # not used by the template; read by callers
+    num_layers: int = 2  # number of TransformerLayer blocks
+
+
+# Maps ``config.model_type`` to the function rendering its YAML description.
+_CONFIG_BUILDERS = {
+    'gpt-moe': get_moe_gpt_config,
+    'gpt-basic': get_basic_gpt_config,
+}
+
+
+def get_model(config):
+    """Instantiate the model described by ``config``.
+
+    ``config.model_type`` selects the YAML template; the rendered string is
+    handed to ``init_from_yml_string``. An unknown ``model_type`` raises
+    ``KeyError``.
+    """
+    build_config = _CONFIG_BUILDERS[config.model_type]
+    return init_from_yml_string(build_config(config))
diff --git a/src/pytfex/models/basic.py b/src/pytfex/models/basic.py
new file mode 100644
index 0000000..d087d9b
--- /dev/null
+++ b/src/pytfex/models/basic.py
@@ -0,0 +1,51 @@
+def get_basic_gpt_config(config):
+    """Render the YAML description of a basic (dense MLP) GPT model.
+
+    Args:
+        config: object exposing ``hdn_dim``, ``vcb_size``, ``blk_size`` and
+            ``num_layers`` (e.g. ``GPTBasicConfig``).
+
+    Returns:
+        The YAML document as a string. ``num_heads`` (4) and ``dropout``
+        (0.5) are fixed by the template.
+    """
+    return f"""
+        type: 'GPT'
+        params:
+            hidden_dim: {config.hdn_dim}
+            num_heads: 4
+            dropout: 0.5
+            embedder:
+                type: 'MultiEmbedder'
+                params:
+                    embedders:
+                        -   type: 'TokenEmbedder'
+                            params:
+                                dictionary_size: {config.vcb_size}
+                                hidden_dim: {config.hdn_dim}
+                        -   type: 'PositionEmbedder'
+                            params:
+                                num_positions: {config.blk_size}
+                                hidden_dim: {config.hdn_dim}
+            head:
+                type: 'ClassificationHead'
+                params:
+                    hidden_dim: {config.hdn_dim}
+                    vocab_size: {config.vcb_size}
+            layers:
+                -   num: {config.num_layers}
+                    type: 'TransformerLayer'
+                    params:
+                        hidden_dim: {config.hdn_dim}
+                        attn:
+                            type: 'Attention'
+                            params:
+                                hidden_dim: {config.hdn_dim}
+                                num_heads: 4
+                                dropout: 0.5
+                        mlp:
+                            type: 'MLP'
+                            params:
+                                hidden_dim: {config.hdn_dim}
+                                dropout: 0.5
+        """
\ No newline at end of file
diff --git a/src/pytfex/models/moe.py b/src/pytfex/models/moe.py
new file mode 100644
index 0000000..ff99bd7
--- /dev/null
+++ b/src/pytfex/models/moe.py
@@ -0,0 +1,59 @@
+def get_moe_gpt_config(config):
+    """Render the YAML description of a mixture-of-experts GPT model.
+
+    Args:
+        config: object exposing ``hdn_dim``, ``vcb_size``, ``blk_size``,
+            ``num_layers``, ``c`` and ``num_experts`` (e.g. ``GPTMoEConfig``).
+
+    Returns:
+        The YAML document as a string. ``num_heads`` (4) and ``dropout``
+        (0.5) are fixed by the template; each expert MLP gets an
+        ``intermediate_dim`` of ``4 * hdn_dim``.
+    """
+    return f"""
+        type: 'GPT'
+        params:
+            hidden_dim: {config.hdn_dim}
+            num_heads: 4
+            dropout: 0.5
+            embedder:
+                type: 'MultiEmbedder'
+                params:
+                    embedders:
+                        -   type: 'TokenEmbedder'
+                            params:
+                                dictionary_size: {config.vcb_size}
+                                hidden_dim: {config.hdn_dim}
+                        -   type: 'PositionEmbedder'
+                            params:
+                                num_positions: {config.blk_size}
+                                hidden_dim: {config.hdn_dim}
+            layers:
+                -   num: {config.num_layers}
+                    type: 'TransformerLayer'
+                    params:
+                        hidden_dim: {config.hdn_dim}
+                        attn:
+                            type: 'Attention'
+                            params:
+                                hidden_dim: {config.hdn_dim}
+                                num_heads: 4
+                                dropout: 0.5
+                        mlp:
+                            type: 'MoE'
+                            params:
+                                hidden_dim: {config.hdn_dim}
+                                c: {config.c}
+                                experts:
+                                    -   num: {config.num_experts}
+                                        type: 'MLP'
+                                        params:
+                                            hidden_dim: {config.hdn_dim}
+                                            intermediate_dim: {4*config.hdn_dim}
+                                            dropout: 0.5
+            head:
+                type: 'ClassificationHead'
+                params:
+                    hidden_dim: {config.hdn_dim}
+                    vocab_size: {config.vcb_size}
+        """
diff --git a/src/pytfex/transformer/attention.py b/src/pytfex/transformer/attention.py
index 0fee829..d2ec766 100644
--- a/src/pytfex/transformer/attention.py
+++ b/src/pytfex/transformer/attention.py
@@ -8,6 +8,7 @@ def __init__(
             dropout: float=0.5,
         ) -> None:
         super(Attention, self).__init__()
+        assert hidden_dim % num_heads == 0, f"num_heads must divide hidden_dim, {hidden_dim=}, {num_heads=}"
         self.hidden_dim = hidden_dim
         self.num_heads = num_heads
         self.dropout = torch.tensor(
diff --git a/src/pytfex/transformer/gpt.py b/src/pytfex/transformer/gpt.py
index 7a76d77..4f82831 100644
--- a/src/pytfex/transformer/gpt.py
+++ b/src/pytfex/transformer/gpt.py
@@ -14,7 +14,6 @@ def __init__(
             head: torch.nn.Module=None,
         ):
         super(GPT, self).__init__()
-        assert hidden_dim % num_heads == 0, "num_heads must divide hidden_dim"
         self.hidden_dim = hidden_dim
         self.num_heads = num_heads
         self.dropout = dropout
diff --git a/src/pytfex/transformer/moe.py b/src/pytfex/transformer/moe.py
index 7d1c9f7..a3309d4 100644
--- a/src/pytfex/transformer/moe.py
+++ b/src/pytfex/transformer/moe.py
@@ -41,7 +41,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         b, l, *_ = x.shape
         k = self._compute_k(l)
-        S = torch.softmax(self.gate(x), dim=-1)
+        S = torch.sigmoid(self.gate(x))
         S = S.transpose(1, 2) # (batch_size, num_experts, tokens)
         G, I = torch.topk(S, k, dim=-1)
         # I - (batch_size, num_experts, top_k_tokens) - indices
diff --git a/src/tests/conftest.py b/src/tests/conftest.py
index 307f530..fb24e5b 100644
--- a/src/tests/conftest.py
+++ b/src/tests/conftest.py
@@ -1,38 +1,48 @@
 from torch.utils.data.dataloader import DataLoader
 from tests.dataset import SortDataset
-
-from pytfex.transformer.make_model import init_from_yml_string
 from pytfex.utils import set_seed
-from tests.basic_model import get_basic_gpt_config
-from tests.moe_model import get_moe_gpt_config
 
-import torch
+from pytfex.models import (
+    get_model,
+    GPTMoEConfig,
+    GPTBasicConfig,
+)
 
+import torch
 import pytest
 
 
 @pytest.fixture(params=[
-    (256, 6, 3, 32, None, None, 'gpt-basic'), # (hdn_dim, length, num_digits, batch_size, _, _, model_type)
-    (256, 6, 3, 32, 2, 4, 'gpt-moe') # (hdn_dim, length, num_digits, batch_size, k, num_experts, model_type)
+    # (config, length): basic GPT config, SortDataset sequence length
+    (GPTBasicConfig(
+        model_type='gpt-basic',
+        vcb_size=3,
+        hdn_dim=256,
+        blk_size=12,
+        batch_size=32,
+    ), 6),
+    # (config, length): MoE GPT config, SortDataset sequence length
+    (GPTMoEConfig(
+        model_type='gpt-moe',
+        vcb_size=3,
+        hdn_dim=256,
+        blk_size=12,
+        c=2,
+        num_experts=4,
+        batch_size=32,
+    ), 6)
 ])
 def training_setup(request):
     set_seed(0)
-
-    hdn_dim, length, num_digits, batch_size, c, num_experts, model_type = request.param
+    config, length = request.param
+    num_digits = config.vcb_size
     ds = SortDataset(split='train', length=length, num_digits=num_digits)
-    dl = DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=0)
-    blk_size = ds.get_block_size()
-    vcb_size = ds.get_vocab_size()
-
-    config = {
-        'gpt-basic': lambda: get_basic_gpt_config(vcb_size, hdn_dim, blk_size),
-        'gpt-moe': lambda: get_moe_gpt_config(vcb_size, hdn_dim, blk_size, c, num_experts)
-    }[model_type]()
-    model = init_from_yml_string(config)
+    dl = DataLoader(ds, batch_size=config.batch_size, shuffle=True, num_workers=0)
+    model = get_model(config)
 
     def val_fn(model):
         ds = SortDataset(split='test', length=length, num_digits=num_digits)
-        dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0)
+        dl = DataLoader(ds, batch_size=config.batch_size, shuffle=False, num_workers=0)
         total = 0
         sum_acc = 0
         for x, y_true in dl:
@@ -47,4 +57,4 @@ def val_fn(model):
         acc = sum_acc / total
         return acc
 
-    return dl, model, val_fn, model_type
\ No newline at end of file
+    return dl, model, val_fn, config.model_type
\ No newline at end of file
diff --git a/src/tests/test_model_train.py b/src/tests/test_model_train.py
index a1bcce7..f24620b 100644
--- a/src/tests/test_model_train.py
+++ b/src/tests/test_model_train.py
@@ -1,7 +1,8 @@
 from pytfex.utils import set_seed, count_parameters
 
-import pytest
 import torch
+import time
+import pytest
 
 
 @pytest.mark.skip(reason="Slow running/intermittent test")
@@ -16,9 +17,10 @@ def test_train(training_setup):
     print('\n')
     print(f'-- model-type : {model_type} --')
     print(f'-- # params   : {count_parameters(model)} --')
-    print('epoch_|_loss_____|_acc______')
-    print(f'    -1| None     | {acc:0.5}')
-    for epoch in range(5):
+    print('epoch_|_loss_____|_acc______|_time____')
+    print(f'    -1| None     | {acc:0.5}  | None')
+    for epoch in range(10):
+        s = time.time()
         for x, y_true in dl:
             b, l = x.shape
             opt.zero_grad()
@@ -30,9 +32,9 @@ def test_train(training_setup):
             loss = loss_fn(y, y_true)
             loss.backward()
             opt.step()
-
+        e = time.time()
         acc = val_fn(model)
-        print(f'{epoch:>6}| {loss.item():<8.5} | {acc:0.5}')
+        print(f'{epoch:>6}| {loss.item():<8.5} | {acc:0.5}  | {e-s:0.5}')
 
     assert loss.item() < 0.15
     acc = val_fn(model)