diff --git a/scripts/train.py b/scripts/train.py
index 03ffc77b..cab173bf 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -1,7 +1,8 @@
-from delphi.train.training import DDP,TrainingConfig, model_initialization, train_loop
-from delphi.train.utils import load_config
 from argparse import ArgumentParser
+
+from delphi.train.training import DDP, TrainingConfig, model_initialization, train_loop
+from delphi.train.utils import load_config
 
 
 def main():
     parser = ArgumentParser()
@@ -11,5 +12,5 @@ def main():
     config = load_config(args.config)
     TrainConf = TrainingConfig(config)
 
-    model,model_args = model_initialization(config)
-    train_loop(model, TrainConf)
\ No newline at end of file
+    model, model_args = model_initialization(config)
+    train_loop(model, TrainConf)
diff --git a/scripts/upload_stories.py b/scripts/upload_stories.py
index 6a420153..e1afc042 100644
--- a/scripts/upload_stories.py
+++ b/scripts/upload_stories.py
@@ -1,24 +1,20 @@
 import json
-import pandas as pd 
+import pandas as pd
 from datasets import Dataset
-
 
 splits = [
     ("../train/llama2c/data/TinyStoriesV2-GPT4-train-clean.json", "train"),
-    ("../train/llama2c/data/TinyStoriesV2-GPT4-valid-clean.json", "validation")
+    ("../train/llama2c/data/TinyStoriesV2-GPT4-valid-clean.json", "validation"),
 ]
 
+
 def load_dataset(filepath):
-    with open(filepath, 'r', encoding='utf-8') as file:
+    with open(filepath, "r", encoding="utf-8") as file:
         return json.load(file)
-
-
-for (filename, split) in splits:
+
+
+for filename, split in splits:
     stories = load_dataset(filename)
     dataset = Dataset.from_pandas(pd.DataFrame(stories))
-    dataset.push_to_hub(
-        repo_id="",
-        split=split,
-        token=""
-    )
\ No newline at end of file
+    dataset.push_to_hub(repo_id="", split=split, token="")
diff --git a/scripts/upload_tokens.py b/scripts/upload_tokens.py
index 33cd937b..d83f00e5 100644
--- a/scripts/upload_tokens.py
+++ b/scripts/upload_tokens.py
@@ -1,10 +1,10 @@
+from functools import partial
+
 import pandas as pd
 from datasets import Dataset
-from functools import partial
 
 from delphi import PretokDataset
 
-
 batch_size = 1
 max_seq_len = 512
 vocab_size = 4096
@@ -12,7 +12,6 @@
 device = "cuda"
 
 for split in ["train", "validation"]:
-
     ds = PretokDataset(
         split=split,
         batch_size=batch_size,
@@ -20,18 +19,14 @@
         vocab_size=vocab_size,
         vocab_source=vocab_source,
     )
-
+
     num_batches = len(PretokDataset)
-
+
     tokens = []
     for idx, (chunk) in enumerate(ds):
-        if idx >= num_batches: 
+        if idx >= num_batches:
             break
-        tokens.append({'tokens': chunk.numpy()})
-
+        tokens.append({"tokens": chunk.numpy()})
+
     dataset = Dataset.from_pandas(pd.DataFrame(tokens))
-    dataset.push_to_hub(
-        repo_id="",
-        split=split,
-        token=""
-    )
+    dataset.push_to_hub(repo_id="", split=split, token="")
diff --git a/src/delphi/mamba.py b/src/delphi/mamba.py
index 32257779..87710f6f 100644
--- a/src/delphi/mamba.py
+++ b/src/delphi/mamba.py
@@ -1,7 +1,9 @@
 from dataclasses import dataclass
+
+import torch.nn.functional as F
 from mamba_ssm.models.config_mamba import MambaConfig
 from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
-import torch.nn.functional as F
+
 
 @dataclass
 class MambaArgs(MambaConfig):
@@ -9,7 +11,6 @@ class MambaArgs(MambaConfig):
 
 
 class Mamba(MambaLMHeadModel):
-
     def __init__(self, params) -> None:
         super().__init__(params)
 
@@ -20,6 +21,8 @@ def forward(self, input_ids, target_ids=None):
         """
         hidden_states = self.backbone(input_ids)
         logits = self.lm_head(hidden_states)
-        self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_ids.view(-1), ignore_index=-1)
-
+        self.last_loss = F.cross_entropy(
+            logits.view(-1, logits.size(-1)), target_ids.view(-1), ignore_index=-1
+        )
+
         return logits
diff --git a/src/delphi/train/__init__.py b/src/delphi/train/__init__.py
index 4f4a3966..56407d22 100644
--- a/src/delphi/train/__init__.py
+++ b/src/delphi/train/__init__.py
@@ -1 +1 @@
-from .llama2c.tinystories import Task
\ No newline at end of file
+from .llama2c.tinystories import Task
diff --git a/src/delphi/train/llama2.py b/src/delphi/train/llama2.py
index ca204298..5d8e8625 100644
--- a/src/delphi/train/llama2.py
+++ b/src/delphi/train/llama2.py
@@ -9,6 +9,5 @@ class LLaMA2Args(ModelArgs):
 
 
 class LLaMA2(Transformer):
-
     def __init__(self, params) -> None:
         super().__init__(params)
diff --git a/src/delphi/train/training.py b/src/delphi/train/training.py
index ecbd034e..c8b0c035 100644
--- a/src/delphi/train/training.py
+++ b/src/delphi/train/training.py
@@ -1,53 +1,61 @@
-
-from datetime import datetime
-from dataclasses import dataclass
-import os
-import torch
+import math
+import os
 import time
 from contextlib import nullcontext
-import math
+from dataclasses import dataclass
+from datetime import datetime
+
+import torch
+
+
 @dataclass
 def TrainingConfig(config):
     # -----------------------------------------------------------------------------
     # I/O
     out_dir: str = "out"
     eval_interval: int = 2000
-    log_interval:int = 1
-    eval_iters:int = 100
-    eval_only:bool = False # if True, script exits right after the first eval
-    always_save_checkpoint:bool = False # if True, always save a checkpoint after each eval
-    init_from:bool = "scratch" # 'scratch' or 'resume'
+    log_interval: int = 1
+    eval_iters: int = 100
+    eval_only: bool = False  # if True, script exits right after the first eval
+    always_save_checkpoint: bool = (
+        False  # if True, always save a checkpoint after each eval
+    )
+    init_from: str = "scratch"  # 'scratch' or 'resume'
     # wandb logging
-    wandb_log:bool = False # disabled by default
-    wandb_project:str = "llamac"
-    wandb_run_name:str = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+    wandb_log: bool = False  # disabled by default
+    wandb_project: str = "llamac"
+    wandb_run_name: str = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
     # data
-    batch_size:int = 128 # if gradient_accumulation_steps > 1, this is the micro-batch size
-    max_seq_len:int = 256
-    vocab_source:str = "llama2" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
-    vocab_size:str = 32000 # the Llama 2 tokenizer has 32K tokens
+    batch_size: int = (
+        128  # if gradient_accumulation_steps > 1, this is the micro-batch size
+    )
+    max_seq_len: int = 256
+    vocab_source: str = (
+        "llama2"  # llama2|custom; use Llama 2 vocab from Meta, or custom trained
+    )
+    vocab_size: int = 32000  # the Llama 2 tokenizer has 32K tokens
     # model
-    dim:int = 288
-    n_layers:int = 6
-    n_heads:int = 6
-    n_kv_heads:int = 6
-    multiple_of:int = 32
-    dropout:int = 0.0
+    dim: int = 288
+    n_layers: int = 6
+    n_heads: int = 6
+    n_kv_heads: int = 6
+    multiple_of: int = 32
+    dropout: float = 0.0
     # adamw optimizer
-    gradient_accumulation_steps:int = 4 # used to simulate larger batch sizes
-    learning_rate:float = 5e-4 # max learning rate
-    max_iters:int = 100000 # total number of training iterations
-    weight_decay:float = 1e-1
-    beta1:float = 0.9
-    beta2:float = 0.95
-    grad_clip:float = 1.0 # clip gradients at this value, or disable if == 0.0
+    gradient_accumulation_steps: int = 4  # used to simulate larger batch sizes
+    learning_rate: float = 5e-4  # max learning rate
+    max_iters: int = 100000  # total number of training iterations
+    weight_decay: float = 1e-1
+    beta1: float = 0.9
+    beta2: float = 0.95
+    grad_clip: float = 1.0  # clip gradients at this value, or disable if == 0.0
     # learning rate decay settings
-    decay_lr:bool = True # whether to decay the learning rate
-    warmup_iters:int = 1000 # how many steps to warm up for
+    decay_lr: bool = True  # whether to decay the learning rate
+    warmup_iters: int = 1000  # how many steps to warm up for
     # system
-    device:str = "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
-    dtype:str = "bfloat16" # float32|bfloat16|float16
-    compile:bool = True # use PyTorch 2.0 to compile the model to be faster
+    device: str = "cuda"  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
+    dtype: str = "bfloat16"  # float32|bfloat16|float16
+    compile: bool = True  # use PyTorch 2.0 to compile the model to be faster
     # -----------------------------------------------------------------------------
     config_keys = [
         k
@@ -57,7 +65,6 @@ def TrainingConfig(config):
     exec(open("configurator.py").read())  # overrides from command line or config file
     config = {k: globals()[k] for k in config_keys}  # will be useful for logging
-
 
     # -----------------------------------------------------------------------------
     # fixing some hyperparams to sensible defaults
@@ -66,7 +73,9 @@ def TrainingConfig(config):
 
     # validating checks
     assert vocab_source in ["llama2", "custom"]
-    assert vocab_source == "custom" or vocab_size == 32000, "The vocab from Meta has 32K tokens"
+    assert (
+        vocab_source == "custom" or vocab_size == 32000
+    ), "The vocab from Meta has 32K tokens"
 
     # various inits, derived attributes, I/O setup
     seed = 1337
@@ -74,10 +83,10 @@
 
 
 def model_initialization(config):
-
-    #model
+    # model
     if config["model"] == "llama2":
         from delphi.models.llama2 import LLaMA2, LLaMA2Args
+
         model_args = dict(
             dim=config["dim"],
             n_layers=config["n_layers"],
@@ -92,6 +101,7 @@ def model_initialization(config):
         model = LLaMA2(gptconf)
     elif config["model"] == "mamba":
         from delphi.models.mamba import Mamba, MambaArgs
+
         model_args = dict(
             dim=config["dim"],
             n_layers=config["n_layers"],
@@ -99,16 +109,24 @@ def model_initialization(config):
         )
         mambaconf = MambaArgs(**model_args)
         model = Mamba(mambaconf)
-
+
     if config["init_from"] == "resume":
         print(f"Resuming training from {config['out_dir']}")
         # resume training from a checkpoint.
-        ckpt_path = os.path.join(config['out_dir'], "ckpt.pt")
-        checkpoint = torch.load(ckpt_path, map_location=config['device'])
+        ckpt_path = os.path.join(config["out_dir"], "ckpt.pt")
+        checkpoint = torch.load(ckpt_path, map_location=config["device"])
         checkpoint_model_args = checkpoint["model_args"]
         # force these config attributes to be equal otherwise we can't even resume training
         # the rest of the attributes (e.g. dropout) can stay as desired from command line
-        for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
+        for k in [
+            "dim",
+            "n_layers",
+            "n_heads",
+            "n_kv_heads",
+            "vocab_size",
+            "multiple_of",
+            "max_seq_len",
+        ]:
             model_args[k] = checkpoint_model_args[k]
         # create the model
         state_dict = checkpoint["model"]
@@ -121,22 +139,29 @@ def model_initialization(config):
         model.load_state_dict(state_dict)
         config.iter_num = checkpoint["iter_num"]
         config.best_val_loss = checkpoint["best_val_loss"]
-
+
     model.to(config["device"])
 
     # compile the model
     if config["compile"]:
         print("compiling the model... (takes a ~minute)")
         unoptimized_model = model
         model = torch.compile(model)  # requires PyTorch 2.0
-    return model,model_args
+    return model, model_args
+
 
 def train_loop(model, TrainConf):
     torch.manual_seed(TrainConf.seed)
     torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
     torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
-    device_type = "cuda" if "cuda" in TrainConf.device else "cpu" # for later use in torch.autocast
+    device_type = (
+        "cuda" if "cuda" in TrainConf.device else "cpu"
+    )  # for later use in torch.autocast
     # note: float16 data type will automatically use a GradScaler
-    ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[TrainConf.dtype]
+    ptdtype = {
+        "float32": torch.float32,
+        "bfloat16": torch.bfloat16,
+        "float16": torch.float16,
+    }[TrainConf.dtype]
     ctx = (
         nullcontext()
         if device_type == "cpu"
@@ -145,14 +170,24 @@ def train_loop(model, TrainConf):
     scaler = torch.cuda.amp.GradScaler(enabled=(TrainConf.dtype == "float16"))
 
     # optimizer
-    optimizer = model.configure_optimizers(TrainConf.weight_decay, TrainConf.learning_rate, (TrainConf.beta1, TrainConf.beta2), device_type)
+    optimizer = model.configure_optimizers(
+        TrainConf.weight_decay,
+        TrainConf.learning_rate,
+        (TrainConf.beta1, TrainConf.beta2),
+        device_type,
+    )
     if TrainConf.init_from == "resume" and "optimizer" in checkpoint:
         optimizer.load_state_dict(checkpoint["optimizer"])
     checkpoint = None  # free up memory
 
     if TrainConf.wandb_log:
         import wandb
-        wandb.init(project=TrainConf.wandb_project, name=TrainConf.wandb_run_name, config=TrainConf.config)
+
+        wandb.init(
+            project=TrainConf.wandb_project,
+            name=TrainConf.wandb_run_name,
+            config=TrainConf.config,
+        )
 
     train_batch_iter = TrainConf.iter_batches(split="train")
     X, Y = next(train_batch_iter)  # fetch the very first batch
@@ -162,14 +197,20 @@ def train_loop(model, TrainConf):
     running_mfu = -1.0
     while True:
         # determine and set the learning rate for this iteration
-        lr = get_lr(iter_num,TrainConf) if TrainConf.decay_lr else TrainConf.learning_rate
+        lr = (
+            get_lr(iter_num, TrainConf)
+            if TrainConf.decay_lr
+            else TrainConf.learning_rate
+        )
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr
 
        # evaluate the loss on train/val sets and write checkpoints
        if iter_num % TrainConf.eval_interval == 0:
            losses = estimate_loss()
-            print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+            print(
+                f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
+            )
            if TrainConf.wandb_log:
                try:
                    wandb.log(
@@ -180,7 +221,8 @@ def train_loop(model, TrainConf):
                            "loss/val": losses["val"],
                            "lr": lr,
                            "mfu": running_mfu * 100,  # convert to percentage
-                        }, step = iter_num
+                        },
+                        step=iter_num,
                    )
                except Exception as e:
                    print(f"logging to wandb failed: {e}")
@@ -197,7 +239,9 @@ def train_loop(model, TrainConf):
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
-                model_export(raw_model, os.path.join(out_dir, "model.bin"), version=0)
+                model_export(
+                    raw_model, os.path.join(out_dir, "model.bin"), version=0
+                )
        if iter_num == 0 and eval_only:
            break
 
@@ -230,8 +274,12 @@ def train_loop(model, TrainConf):
            # get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point
            lossf = loss.item() * gradient_accumulation_steps
            if local_iter_num >= 5:  # let the training loop settle a bit
-                mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
-                running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
+                mfu = raw_model.estimate_mfu(
+                    batch_size * gradient_accumulation_steps, dt
+                )
+                running_mfu = (
+                    mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
+                )
            print(
                f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%"
            )
@@ -242,6 +290,7 @@ def train_loop(model, TrainConf):
         if iter_num > max_iters:
             break
 
+
 @torch.no_grad()
 def estimate_loss():
     out = {}
@@ -259,8 +308,9 @@ def estimate_loss():
     model.train()
     return out
 
+
 # learning rate decay scheduler (cosine with warmup)
-def get_lr(it,TrainConf):
+def get_lr(it, TrainConf):
     # 1) linear warmup for warmup_iters steps
     if it < TrainConf.warmup_iters:
         return TrainConf.learning_rate * it / TrainConf.warmup_iters
@@ -268,9 +318,9 @@
     if it > TrainConf.lr_decay_iters:
         return TrainConf.min_lr
     # 3) in between, use cosine decay down to min learning rate
-    decay_ratio = (it - TrainConf.warmup_iters) / (TrainConf.lr_decay_iters - TrainConf.warmup_iters)
+    decay_ratio = (it - TrainConf.warmup_iters) / (
+        TrainConf.lr_decay_iters - TrainConf.warmup_iters
+    )
     assert 0 <= decay_ratio <= 1
     coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
     return TrainConf.min_lr + coeff * (TrainConf.learning_rate - TrainConf.min_lr)
-
-
diff --git a/src/delphi/train/training_old.py b/src/delphi/train/training_old.py
index bd1a6a9d..2802d2af 100644
--- a/src/delphi/train/training_old.py
+++ b/src/delphi/train/training_old.py
@@ -17,6 +17,7 @@
 """
 
 import torch._dynamo
+
 torch._dynamo.config.suppress_errors = True
 
 import math
@@ -27,11 +28,10 @@
 from functools import partial
 
 import torch
+from llama2 import LLaMA2, LLaMA2Args
+from llama2c import Task, model_export
 from torch.distributed import destroy_process_group, init_process_group
 from torch.nn.parallel import DistributedDataParallel as DDP
-
-from llama2 import LLaMA2, LLaMA2Args
-from llama2c import model_export, Task
 from tqdm import tqdm
 
 # -----------------------------------------------------------------------------
@@ -51,8 +51,10 @@
 # data
 batch_size = 64  # if gradient_accumulation_steps > 1, this is the micro-batch size
 max_seq_len = 256
-vocab_source = "llama2" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
-vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
+vocab_source = (
+    "llama2"  # llama2|custom; use Llama 2 vocab from Meta, or custom trained
+)
+vocab_size = 32000  # the Llama 2 tokenizer has 32K tokens
 # model
 dim = 288
 n_layers = 6
@@ -72,7 +74,9 @@
 decay_lr = True  # whether to decay the learning rate
 warmup_iters = 1000  # how many steps to warm up for
 # system
-device = "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
+device = (
+    "cuda"  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
+)
 dtype = "bfloat16"  # float32|bfloat16|float16
 compile = False  # use PyTorch 2.0 to compile the model to be faster
 # -----------------------------------------------------------------------------
@@ -81,7 +85,9 @@
     for k, v in globals().items()
     if not k.startswith("_") and isinstance(v, (int, float, bool, str))
 ]
-exec(open("./llama2c/configurator.py").read()) # overrides from command line or config file
+exec(
+    open("./llama2c/configurator.py").read()
+)  # overrides from command line or config file
open("./llama2c/configurator.py").read() +) # overrides from command line or config file config = {k: globals()[k] for k in config_keys} # will be useful for logging @@ -110,7 +116,9 @@ # validating checks assert vocab_source in ["llama2", "custom"] -assert vocab_source == "custom" or vocab_size == 32000, "The vocab from Meta has 32K tokens" +assert ( + vocab_source == "custom" or vocab_size == 32000 +), "The vocab from Meta has 32K tokens" # various inits, derived attributes, I/O setup seed = 1337 @@ -133,10 +141,14 @@ master_process = True seed_offset = 0 ddp_world_size = 1 -tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len +tokens_per_iter = ( + gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len +) if master_process: print(f"tokens per iteration will be: {tokens_per_iter:,}") - print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len") + print( + f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len" + ) if master_process: os.makedirs(out_dir, exist_ok=True) @@ -145,7 +157,11 @@ torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn device_type = "cuda" if "cuda" in device else "cpu" # for later use in torch.autocast # note: float16 data type will automatically use a GradScaler -ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype] +ptdtype = { + "float32": torch.float32, + "bfloat16": torch.bfloat16, + "float16": torch.float16, +}[dtype] ctx = ( nullcontext() if device_type == "cpu" @@ -161,7 +177,7 @@ vocab_source=vocab_source, device=device, num_workers=0, - seed=seed + seed=seed, ) # init these up here, can override if init_from='resume' (i.e. from a checkpoint) @@ -192,7 +208,15 @@ checkpoint_model_args = checkpoint["model_args"] # force these config attributes to be equal otherwise we can't even resume training # the rest of the attributes (e.g. 
-    for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
+    for k in [
+        "dim",
+        "n_layers",
+        "n_heads",
+        "n_kv_heads",
+        "vocab_size",
+        "multiple_of",
+        "max_seq_len",
+    ]:
         model_args[k] = checkpoint_model_args[k]
     # create the model
     gptconf = LLaMA2Args(**model_args)
@@ -213,7 +237,9 @@
 scaler = torch.cuda.amp.GradScaler(enabled=(dtype == "float16"))
 
 # optimizer
-optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
+optimizer = model.configure_optimizers(
+    weight_decay, learning_rate, (beta1, beta2), device_type
+)
 if init_from == "resume" and "optimizer" in checkpoint:
     optimizer.load_state_dict(checkpoint["optimizer"])
 checkpoint = None  # free up memory
@@ -232,6 +258,7 @@
     model._ddp_params_and_buffers_to_ignore = {prefix + "freqs_cis"}
     model = DDP(model, device_ids=[ddp_local_rank])
 
+
 # helps estimate an arbitrarily accurate loss over either split using many batches
 @torch.no_grad()
 def estimate_loss():
@@ -250,6 +277,7 @@ def estimate_loss():
     model.train()
     return out
 
+
 # learning rate decay scheduler (cosine with warmup)
 def get_lr(it):
     # 1) linear warmup for warmup_iters steps
@@ -264,10 +292,14 @@ def get_lr(it):
     coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
     return min_lr + coeff * (learning_rate - min_lr)
 
+
 # logging
 if wandb_log and master_process:
     import wandb
-    wandb.init(entity=wandb_entity, project=wandb_project, name=wandb_run_name, config=config)
+
+    wandb.init(
+        entity=wandb_entity, project=wandb_project, name=wandb_run_name, config=config
+    )
 
 # training loop
 t0 = time.time()
@@ -277,9 +309,8 @@ def get_lr(it):
 epoch = 0
 for epoch in range(max_epochs):
     train_batch_iter = iter_batches(split="train", epoch=epoch)
-    X, Y = next(train_batch_iter) # fetch the very first batch
+    X, Y = next(train_batch_iter)  # fetch the very first batch
     for _ in tqdm(range(num_steps)):
-
        # determine and set the learning rate for this iteration
        lr = get_lr(iter_num) if decay_lr else learning_rate
        for param_group in optimizer.param_groups:
@@ -288,7 +319,9 @@ def get_lr(it):
        # evaluate the loss on train/val sets and write checkpoints
        if iter_num % eval_interval == 0 and master_process:
            losses = estimate_loss()
-            print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+            print(
+                f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
+            )
            if wandb_log:
                try:
                    wandb.log(
@@ -299,7 +332,8 @@ def get_lr(it):
                            "loss/val": losses["val"],
                            "lr": lr,
                            "mfu": running_mfu * 100,  # convert to percentage
-                        }, step = iter_num
+                        },
+                        step=iter_num,
                    )
                except Exception as e:
                    print(f"logging to wandb failed: {e}")
@@ -316,7 +350,9 @@ def get_lr(it):
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
-                model_export(raw_model, os.path.join(out_dir, "model.bin"), version=0)
+                model_export(
+                    raw_model, os.path.join(out_dir, "model.bin"), version=0
+                )
        if iter_num == 0 and eval_only:
            break
 
@@ -328,7 +364,9 @@ def get_lr(it):
            # the official way to do this is with model.no_sync() context manager, but
            # I really dislike that this bloats the code and forces us to repeat code
            # looking at the source of that context manager, it just toggles this variable
-            model.require_backward_grad_sync = micro_step == gradient_accumulation_steps - 1
+            model.require_backward_grad_sync = (
+                micro_step == gradient_accumulation_steps - 1
+            )
            with ctx:
                logits = model(X, Y)
                loss = raw_model.last_loss
@@ -355,8 +393,12 @@ def get_lr(it):
            # get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point
            lossf = loss.item() * gradient_accumulation_steps
            if local_iter_num >= 5:  # let the training loop settle a bit
-                mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
-                running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
+                mfu = raw_model.estimate_mfu(
+                    batch_size * gradient_accumulation_steps, dt
+                )
+                running_mfu = (
+                    mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
+                )
            print(
                f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%"
            )
diff --git a/src/delphi/train/utils.py b/src/delphi/train/utils.py
index 73c3e9e6..0a62c863 100644
--- a/src/delphi/train/utils.py
+++ b/src/delphi/train/utils.py
@@ -1,6 +1,6 @@
 import json
 
+
 def load_config(config_path):
-    with open(config_path, 'r') as file:
+    with open(config_path, "r") as file:
         return json.load(file)
-
\ No newline at end of file