notebook_utils.py
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
import inspect
import logging
import os
import pickle
import subprocess
from typing import Any, Dict

import matplotlib.axes
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from torch import cuda, nn

from nanoGPT.config import train_shakespeare_char
from nanoGPT.model import GPTConfig

# Prefer the IPU training loop when its dependencies are importable; otherwise
# fall back to the standard loop on CUDA or CPU.
try:
    from train_ipu import run_training

    device = "ipu"
except ImportError:  # not on IPU...
    from train import run_training

    device = "cuda" if cuda.is_available() else "cpu"


def config_dict_from_module(module) -> Dict[str, Any]:
    """Return a module's public attributes as a plain config dict."""
    return {k: v for k, v in vars(module).items() if not k.startswith("__")}


def extract_model_params(config):
    """Keep only the keys that GPTConfig's constructor accepts."""
    return {
        k: v
        for k, v in config.items()
        if k in inspect.signature(GPTConfig).parameters.keys()
    }


_general_config_dict = config_dict_from_module(train_shakespeare_char)
_general_config_dict["compile"] = False  # We'll do this in the notebook when necessary
_model_config_dict = extract_model_params(_general_config_dict)
_model_config_dict["vocab_size"] = 65  # Generated from data/shakespeare_char/prepare.py

# Build a single config object: GPTConfig fields come from the model sub-dict,
# then the remaining trainer settings are attached as extra attributes.
config = GPTConfig(**_model_config_dict)
for key, value in _general_config_dict.items():
    setattr(config, key, value)
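
# Hedged illustration (exact values depend on nanoGPT's train_shakespeare_char
# config, not asserted here): after the merge above, `config` carries both
# kinds of field, e.g.
#
#     config.n_layer        # GPTConfig field
#     config.learning_rate  # trainer field, attached via setattr above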


def _gen_experiment_name(model: nn.Module) -> str:
    """Derive a run name from the graph transforms applied to the model."""
    backend_names = [
        b.__qualname__
        for b in getattr(model, "backends", [])
        if hasattr(b, "__qualname__")
    ]
    if backend_names:
        us = any("unit_scaling" in b for b in backend_names)
        fp8 = any("quantisation" in b for b in backend_names)
        if us and not fp8:
            return "unit_scaled_gpt"
        if fp8 and not us:
            return "fp8_gpt"
        if us and fp8:
            return "unit_scaled_fp8_gpt"
    return "gpt"

data_dir = "nanoGPT/data/shakespeare_char"


def download_train_data():
    """Run nanoGPT's prepare.py in-place to fetch and tokenize the dataset."""
    cwd = os.getcwd()
    os.chdir(data_dir)
    try:
        print(f"Downloading training data/tokenizer to: {data_dir}")
        subprocess.run(["python", "prepare.py"])
    finally:
        os.chdir(cwd)  # restore the working directory even if prepare.py fails


class NanoGPTTokenizer:
    """Minimal character-level tokenizer backed by nanoGPT's meta.pkl."""

    def __init__(self):
        meta_file = "nanoGPT/data/shakespeare_char/meta.pkl"
        if not os.path.exists(meta_file):
            download_train_data()
        with open(meta_file, "rb") as f:
            meta = pickle.load(f)
        stoi = meta["stoi"]
        # Unknown characters fall back to the space token.
        self.encode_fn = lambda s: [stoi.get(c, stoi[" "]) for c in s]

    @property
    def pad_token(self):
        return self.encode_fn(" ")[0]

    def __call__(self, seqs, max_length, *args, **kwargs):
        batch = []
        for s in seqs:
            new_s = self.encode_fn(s)[:max_length]
            if len(new_s) < max_length:
                # Pad with a *list* of token ids (`self.pad_token` is an int).
                new_s += [self.pad_token] * (max_length - len(new_s))
            batch.append(new_s)
        batch = torch.tensor(batch)
        return {
            "input_ids": batch,
            "attention_mask": torch.ones_like(batch),  # nanoGPT ignores this anyway
        }
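
# Hedged usage sketch (the call signature only loosely mirrors HuggingFace
# tokenizers; `max_length` is required here):
#
#     tokenizer = NanoGPTTokenizer()
#     out = tokenizer(["To be, or not to be"], max_length=16)
#     out["input_ids"].shape  # torch.Size([1, 16]), space-padded on the right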


def plot(df: pd.DataFrame, name: str) -> matplotlib.axes.Axes:
    """Plot training/validation loss curves for one run onto the current axes."""
    sns.set_theme()
    ax = plt.gca()
    # Advance matplotlib's colour cycle so each run gets its own colour
    # (note: `_get_lines` is a private matplotlib API).
    hue = ax._get_lines.get_next_color()
    for kind, d in df.groupby("Train/Valid"):
        ax.plot(
            d["Steps"],
            d["Loss"],
            solid_joinstyle="miter",
            solid_capstyle="butt",
            linewidth=1.5,
            color=hue,
            label=name if kind == "training" else None,
            ls=dict(training="-", validation="--")[kind],
        )
    # Add a one-off legend key explaining the solid/dashed line-style convention.
    handles, labels = ax.get_legend_handles_labels()
    if "training" not in labels:
        handles += [
            plt.Line2D([0], [0], c="none", marker="none", ls="none"),
            plt.Line2D([0], [0], c="k", ls="-"),
            plt.Line2D([0], [0], c="k", ls="--"),
        ]
        labels += ["", "training", "validation"]
    ax.legend(handles, labels, loc="upper left", bbox_to_anchor=(1.05, 1.0))
    ax.set(xlim=(0, None), ylim=(0.6, 3.0), xlabel="Steps", ylabel="Loss")
    return ax
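
# Hedged sketch: successive calls share the current axes, so two runs can be
# overlaid and distinguished by colour (`df_base` and `df_us` are hypothetical
# DataFrames returned by `train` below):
#
#     plot(df_base, "gpt")
#     plot(df_us, "unit_scaled_gpt")
#     plt.show()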


def train(model: nn.Module, **config_overrides: Any) -> pd.DataFrame:
    """Train `model` with nanoGPT's loop, plot its curves, and return a tidy DataFrame."""
    if device == "cpu":
        logging.warning(
            "CPU does not have sufficient FLOP/s for tractable training. "
            "Please try again using an IPU or GPU."
        )
    experiment_name = _gen_experiment_name(model) + config_overrides.pop(
        "experiment_name_suffix", ""
    )
    if not os.path.exists(f"{data_dir}/train.bin"):
        download_train_data()
    cfg = _general_config_dict.copy()
    cfg.update(
        device=device,
        experiment_name=experiment_name,
    )
    # Unit-scaled runs use a higher learning rate.
    if "unit_scaled" in experiment_name:
        cfg.update(learning_rate=0.02, min_lr=0.002)
    cfg.update(config_overrides)
    print(f"Training {experiment_name} ...")
    results = run_training(model, cfg)
    train_df = pd.DataFrame.from_dict(
        {
            "Steps": results["train"]["iters"],
            "Loss": results["train"]["losses"],
        }
    )
    valid_df = pd.DataFrame.from_dict(
        {
            "Steps": results["valid"]["iters"],
            "Loss": results["valid"]["losses"],
        }
    )
    train_df["Train/Valid"] = "training"
    valid_df["Train/Valid"] = "validation"
    df = pd.concat([train_df, valid_df])
    df["Model"] = experiment_name
    plot(df, experiment_name)
    return df
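
# Hedged end-to-end sketch, as the notebook might use this module (the model
# construction is an assumption; any nn.Module accepted by run_training works):
#
#     from nanoGPT.model import GPT
#     model = GPT(config)
#     df = train(model, max_iters=500)  # overrides are forwarded to run_training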