diff --git a/scripts/train.py b/scripts/train.py
index 03ffc77b..cab173bf 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -1,7 +1,8 @@
-from delphi.train.training import DDP,TrainingConfig, model_initialization, train_loop
-from delphi.train.utils import load_config
 from argparse import ArgumentParser
+
+from delphi.train.training import DDP, TrainingConfig, model_initialization, train_loop
+from delphi.train.utils import load_config
 
 
 def main():
     parser = ArgumentParser()
@@ -11,5 +12,5 @@ def main():
     config = load_config(args.config)
     TrainConf = TrainingConfig(config)
 
-    model,model_args = model_initialization(config)
-    train_loop(model, TrainConf)
\ No newline at end of file
+    model, model_args = model_initialization(config)
+    train_loop(model, TrainConf)
diff --git a/scripts/upload_stories.py b/scripts/upload_stories.py
index 6a420153..e1afc042 100644
--- a/scripts/upload_stories.py
+++ b/scripts/upload_stories.py
@@ -1,24 +1,20 @@
 import json
-import pandas as pd 
+import pandas as pd
 from datasets import Dataset
-
 
 splits = [
     ("../train/llama2c/data/TinyStoriesV2-GPT4-train-clean.json", "train"),
-    ("../train/llama2c/data/TinyStoriesV2-GPT4-valid-clean.json", "validation")
+    ("../train/llama2c/data/TinyStoriesV2-GPT4-valid-clean.json", "validation"),
 ]
 
+
 def load_dataset(filepath):
-    with open(filepath, 'r', encoding='utf-8') as file:
+    with open(filepath, "r", encoding="utf-8") as file:
         return json.load(file)
-
-
-for (filename, split) in splits:
+
+
+for filename, split in splits:
     stories = load_dataset(filename)
     dataset = Dataset.from_pandas(pd.DataFrame(stories))
-    dataset.push_to_hub(
-        repo_id="",
-        split=split,
-        token=""
-    )
\ No newline at end of file
+    dataset.push_to_hub(repo_id="", split=split, token="")
diff --git a/scripts/upload_tokens.py b/scripts/upload_tokens.py
index 33cd937b..d83f00e5 100644
--- a/scripts/upload_tokens.py
+++ b/scripts/upload_tokens.py
@@ -1,10 +1,10 @@
+from functools import partial
+
 import pandas as pd
 from datasets import Dataset
-from functools import partial
 
 from delphi import PretokDataset
 
-
 batch_size = 1
 max_seq_len = 512
 vocab_size = 4096
@@ -12,7 +12,6 @@
 device = "cuda"
 
 for split in ["train", "validation"]:
-
     ds = PretokDataset(
         split=split,
         batch_size=batch_size,
@@ -20,18 +19,14 @@
         vocab_size=vocab_size,
         vocab_source=vocab_source,
     )
-
+
     num_batches = len(PretokDataset)
-
+
     tokens = []
     for idx, (chunk) in enumerate(ds):
-        if idx >= num_batches: 
+        if idx >= num_batches:
             break
-        tokens.append({'tokens': chunk.numpy()})
-
+        tokens.append({"tokens": chunk.numpy()})
+
     dataset = Dataset.from_pandas(pd.DataFrame(tokens))
-    dataset.push_to_hub(
-        repo_id="",
-        split=split,
-        token=""
-    )
+    dataset.push_to_hub(repo_id="", split=split, token="")
diff --git a/src/delphi/mamba.py b/src/delphi/mamba.py
index 32257779..87710f6f 100644
--- a/src/delphi/mamba.py
+++ b/src/delphi/mamba.py
@@ -1,7 +1,9 @@
 from dataclasses import dataclass
+
+import torch.nn.functional as F
 from mamba_ssm.models.config_mamba import MambaConfig
 from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
-import torch.nn.functional as F
+
 
 @dataclass
 class MambaArgs(MambaConfig):
@@ -9,7 +11,6 @@ class MambaArgs(MambaConfig):
 
 
 class Mamba(MambaLMHeadModel):
-
     def __init__(self, params) -> None:
         super().__init__(params)
 
@@ -20,6 +21,8 @@ def forward(self, input_ids, target_ids=None):
         """
         hidden_states = self.backbone(input_ids)
         logits = self.lm_head(hidden_states)
-        self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_ids.view(-1), ignore_index=-1)
-
+        self.last_loss = F.cross_entropy(
+            logits.view(-1, logits.size(-1)), target_ids.view(-1), ignore_index=-1
+        )
+
         return logits
diff --git a/src/delphi/train/__init__.py b/src/delphi/train/__init__.py
index 4f4a3966..56407d22 100644
--- a/src/delphi/train/__init__.py
+++ b/src/delphi/train/__init__.py
@@ -1 +1 @@
-from .llama2c.tinystories import Task
\ No newline at end of file
+from .llama2c.tinystories import Task
diff --git a/src/delphi/train/llama2.py b/src/delphi/train/llama2.py
index ca204298..5d8e8625 100644
--- a/src/delphi/train/llama2.py
+++ b/src/delphi/train/llama2.py
@@ -9,6 +9,5 @@ class LLaMA2Args(ModelArgs):
 
 
 class LLaMA2(Transformer):
-
     def __init__(self, params) -> None:
         super().__init__(params)
diff --git a/src/delphi/train/training.py b/src/delphi/train/training.py
index ecbd034e..c8b0c035 100644
--- a/src/delphi/train/training.py
+++ b/src/delphi/train/training.py
@@ -1,53 +1,61 @@
-
-from datetime import datetime
-from dataclasses import dataclass
-import os
-import torch
+import math
+import os
 import time
 from contextlib import nullcontext
-import math
+from dataclasses import dataclass
+from datetime import datetime
+
+import torch
+
+
 @dataclass
 def TrainingConfig(config):
     # -----------------------------------------------------------------------------
     # I/O
     out_dir: str = "out"
     eval_interval: int = 2000
-    log_interval:int = 1
-    eval_iters:int = 100
-    eval_only:bool = False # if True, script exits right after the first eval
-    always_save_checkpoint:bool = False # if True, always save a checkpoint after each eval
-    init_from:bool = "scratch" # 'scratch' or 'resume'
+    log_interval: int = 1
+    eval_iters: int = 100
+    eval_only: bool = False  # if True, script exits right after the first eval
+    always_save_checkpoint: bool = (
+        False  # if True, always save a checkpoint after each eval
+    )
+    init_from: str = "scratch"  # 'scratch' or 'resume'
     # wandb logging
-    wandb_log:bool = False # disabled by default
-    wandb_project:str = "llamac"
-    wandb_run_name:str = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+    wandb_log: bool = False  # disabled by default
+    wandb_project: str = "llamac"
+    wandb_run_name: str = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
     # data
-    batch_size:int = 128 # if gradient_accumulation_steps > 1, this is the micro-batch size
-    max_seq_len:int = 256
-    vocab_source:str = "llama2" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
-    vocab_size:str = 32000 # the Llama 2 tokenizer has 32K tokens
+    batch_size: int = (
+        128  # if gradient_accumulation_steps > 1, this is the micro-batch size
+    )
+    max_seq_len: int = 256
+    vocab_source: str = (
+        "llama2"  # llama2|custom; use Llama 2 vocab from Meta, or custom trained
+    )
+    vocab_size: int = 32000  # the Llama 2 tokenizer has 32K tokens
     # model
-    dim:int = 288
-    n_layers:int = 6
-    n_heads:int = 6
-    n_kv_heads:int = 6
-    multiple_of:int = 32
-    dropout:int = 0.0
+    dim: int = 288
+    n_layers: int = 6
+    n_heads: int = 6
+    n_kv_heads: int = 6
+    multiple_of: int = 32
+    dropout: float = 0.0
     # adamw optimizer
-    gradient_accumulation_steps:int = 4 # used to simulate larger batch sizes
-    learning_rate:float = 5e-4 # max learning rate
-    max_iters:int = 100000 # total number of training iterations
-    weight_decay:float = 1e-1
-    beta1:float = 0.9
-    beta2:float = 0.95
-    grad_clip:float = 1.0 # clip gradients at this value, or disable if == 0.0
+    gradient_accumulation_steps: int = 4  # used to simulate larger batch sizes
+    learning_rate: float = 5e-4  # max learning rate
+    max_iters: int = 100000  # total number of training iterations
+    weight_decay: float = 1e-1
+    beta1: float = 0.9
+    beta2: float = 0.95
+    grad_clip: float = 1.0  # clip gradients at this value, or disable if == 0.0
     # learning rate decay settings
-    decay_lr:bool = True # whether to decay the learning rate
-    warmup_iters:int = 1000 # how many steps to warm up for
+    decay_lr: bool = True  # whether to decay the learning rate
+    warmup_iters: int = 1000  # how many steps to warm up for
     # system
-    device:str = "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
-    dtype:str = "bfloat16" # float32|bfloat16|float16
-    compile:bool = True # use PyTorch 2.0 to compile the model to be faster
+    device: str = "cuda"  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
+    dtype: str = "bfloat16"  # float32|bfloat16|float16
+    compile: bool = True  # use PyTorch 2.0 to compile the model to be faster
     # -----------------------------------------------------------------------------
     config_keys = [
         k
@@ -57,7 +65,6 @@ def TrainingConfig(config):
     exec(open("configurator.py").read())  # overrides from command line or config file
     config = {k: globals()[k] for k in config_keys}  # will be useful for logging
-
 
     # -----------------------------------------------------------------------------
     # fixing some hyperparams to sensible defaults
@@ -66,7 +73,9 @@ def TrainingConfig(config):
 
     # validating checks
     assert vocab_source in ["llama2", "custom"]
-    assert vocab_source == "custom" or vocab_size == 32000, "The vocab from Meta has 32K tokens"
+    assert (
+        vocab_source == "custom" or vocab_size == 32000
+    ), "The vocab from Meta has 32K tokens"
 
     # various inits, derived attributes, I/O setup
     seed = 1337
@@ -74,10 +83,10 @@
 
 
 def model_initialization(config):
-
-    #model
+    # model
     if config["model"] == "llama2":
         from delphi.models.llama2 import LLaMA2, LLaMA2Args
+
         model_args = dict(
             dim=config["dim"],
             n_layers=config["n_layers"],
@@ -92,6 +101,7 @@ def model_initialization(config):
         model = LLaMA2(gptconf)
     elif config["model"] == "mamba":
         from delphi.models.mamba import Mamba, MambaArgs
+
         model_args = dict(
             dim=config["dim"],
             n_layers=config["n_layers"],
@@ -99,16 +109,24 @@ def model_initialization(config):
         )
         mambaconf = MambaArgs(**model_args)
         model = Mamba(mambaconf)
-
+
     if config["init_from"] == "resume":
         print(f"Resuming training from {config['out_dir']}")
         # resume training from a checkpoint.
-        ckpt_path = os.path.join(config['out_dir'], "ckpt.pt")
-        checkpoint = torch.load(ckpt_path, map_location=config['device'])
+        ckpt_path = os.path.join(config["out_dir"], "ckpt.pt")
+        checkpoint = torch.load(ckpt_path, map_location=config["device"])
         checkpoint_model_args = checkpoint["model_args"]
         # force these config attributes to be equal otherwise we can't even resume training
         # the rest of the attributes (e.g. dropout) can stay as desired from command line
-        for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
+        for k in [
+            "dim",
+            "n_layers",
+            "n_heads",
+            "n_kv_heads",
+            "vocab_size",
+            "multiple_of",
+            "max_seq_len",
+        ]:
             model_args[k] = checkpoint_model_args[k]
         # create the model
         state_dict = checkpoint["model"]
@@ -121,22 +139,29 @@ def model_initialization(config):
         model.load_state_dict(state_dict)
         config.iter_num = checkpoint["iter_num"]
         config.best_val_loss = checkpoint["best_val_loss"]
-
+
     model.to(config["device"])
 
     # compile the model
     if config["compile"]:
         print("compiling the model... (takes a ~minute)")
         unoptimized_model = model
         model = torch.compile(model)  # requires PyTorch 2.0
-    return model,model_args
+    return model, model_args
+
 
 def train_loop(model, TrainConf):
     torch.manual_seed(TrainConf.seed)
     torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
     torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
-    device_type = "cuda" if "cuda" in TrainConf.device else "cpu" # for later use in torch.autocast
+    device_type = (
+        "cuda" if "cuda" in TrainConf.device else "cpu"
+    )  # for later use in torch.autocast
     # note: float16 data type will automatically use a GradScaler
-    ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[TrainConf.dtype]
+    ptdtype = {
+        "float32": torch.float32,
+        "bfloat16": torch.bfloat16,
+        "float16": torch.float16,
+    }[TrainConf.dtype]
     ctx = (
         nullcontext()
         if device_type == "cpu"
@@ -145,14 +170,24 @@ def train_loop(model, TrainConf):
     scaler = torch.cuda.amp.GradScaler(enabled=(TrainConf.dtype == "float16"))
 
     # optimizer
-    optimizer = model.configure_optimizers(TrainConf.weight_decay, TrainConf.learning_rate, (TrainConf.beta1, TrainConf.beta2), device_type)
+    optimizer = model.configure_optimizers(
+        TrainConf.weight_decay,
+        TrainConf.learning_rate,
+        (TrainConf.beta1, TrainConf.beta2),
+        device_type,
+    )
     if TrainConf.init_from == "resume" and "optimizer" in checkpoint:
         optimizer.load_state_dict(checkpoint["optimizer"])
     checkpoint = None  # free up memory
 
     if TrainConf.wandb_log:
         import wandb
-        wandb.init(project=TrainConf.wandb_project, name=TrainConf.wandb_run_name, config=TrainConf.config)
+
+        wandb.init(
+            project=TrainConf.wandb_project,
+            name=TrainConf.wandb_run_name,
+            config=TrainConf.config,
+        )
 
     train_batch_iter = TrainConf.iter_batches(split="train")
     X, Y = next(train_batch_iter)  # fetch the very first batch
@@ -162,14 +197,20 @@ def train_loop(model, TrainConf):
     running_mfu = -1.0
     while True:
         # determine and set the learning rate for this iteration
-        lr = get_lr(iter_num,TrainConf) if TrainConf.decay_lr else TrainConf.learning_rate
+        lr = (
+            get_lr(iter_num, TrainConf)
+            if TrainConf.decay_lr
+            else TrainConf.learning_rate
+        )
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr
 
        # evaluate the loss on train/val sets and write checkpoints
        if iter_num % TrainConf.eval_interval == 0:
            losses = estimate_loss()
-            print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+            print(
+                f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
+            )
            if TrainConf.wandb_log:
                try:
                    wandb.log(
@@ -180,7 +221,8 @@ def train_loop(model, TrainConf):
                            "loss/val": losses["val"],
                            "lr": lr,
                            "mfu": running_mfu * 100,  # convert to percentage
-                        }, step = iter_num
+                        },
+                        step=iter_num,
                    )
                except Exception as e:
                    print(f"logging to wandb failed: {e}")
@@ -197,7 +239,9 @@ def train_loop(model, TrainConf):
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
-                model_export(raw_model, os.path.join(out_dir, "model.bin"), version=0)
+                model_export(
+                    raw_model, os.path.join(out_dir, "model.bin"), version=0
+                )
        if iter_num == 0 and eval_only:
            break
 
@@ -230,8 +274,12 @@ def train_loop(model, TrainConf):
            # get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point
            lossf = loss.item() * gradient_accumulation_steps
            if local_iter_num >= 5:  # let the training loop settle a bit
-                mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
-                running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
+                mfu = raw_model.estimate_mfu(
+                    batch_size * gradient_accumulation_steps, dt
+                )
+                running_mfu = (
+                    mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
+                )
            print(
                f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%"
            )
@@ -242,6 +290,7 @@ def train_loop(model, TrainConf):
         if iter_num > max_iters:
             break
 
+
 @torch.no_grad()
 def estimate_loss():
     out = {}
@@ -259,8 +308,9 @@ def estimate_loss():
     model.train()
     return out
 
+
 # learning rate decay scheduler (cosine with warmup)
-def get_lr(it,TrainConf):
+def get_lr(it, TrainConf):
     # 1) linear warmup for warmup_iters steps
     if it < TrainConf.warmup_iters:
         return TrainConf.learning_rate * it / TrainConf.warmup_iters
@@ -268,9 +318,9 @@
     if it > TrainConf.lr_decay_iters:
         return TrainConf.min_lr
     # 3) in between, use cosine decay down to min learning rate
-    decay_ratio = (it - TrainConf.warmup_iters) / (TrainConf.lr_decay_iters - TrainConf.warmup_iters)
+    decay_ratio = (it - TrainConf.warmup_iters) / (
+        TrainConf.lr_decay_iters - TrainConf.warmup_iters
+    )
     assert 0 <= decay_ratio <= 1
     coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
     return TrainConf.min_lr + coeff * (TrainConf.learning_rate - TrainConf.min_lr)
-
-
diff --git a/src/delphi/train/training_old.py b/src/delphi/train/training_old.py
index bd1a6a9d..2802d2af 100644
--- a/src/delphi/train/training_old.py
+++ b/src/delphi/train/training_old.py
@@ -17,6 +17,7 @@
 """
 
 import torch._dynamo
+
 torch._dynamo.config.suppress_errors = True
 
 import math
@@ -27,11 +28,10 @@
 from functools import partial
 
 import torch
+from llama2 import LLaMA2, LLaMA2Args
+from llama2c import Task, model_export
 from torch.distributed import destroy_process_group, init_process_group
 from torch.nn.parallel import DistributedDataParallel as DDP
-
-from llama2 import LLaMA2, LLaMA2Args
-from llama2c import model_export, Task
 from tqdm import tqdm
 
 # -----------------------------------------------------------------------------
@@ -51,8 +51,10 @@
 # data
 batch_size = 64  # if gradient_accumulation_steps > 1, this is the micro-batch size
 max_seq_len = 256
-vocab_source = "llama2" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
-vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
+vocab_source = (
+    "llama2"  # llama2|custom; use Llama 2 vocab from Meta, or custom trained
+)
+vocab_size = 32000  # the Llama 2 tokenizer has 32K tokens
 # model
 dim = 288
 n_layers = 6
@@ -72,7 +74,9 @@
 decay_lr = True  # whether to decay the learning rate
 warmup_iters = 1000  # how many steps to warm up for
 # system
-device = "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
+device = (
+    "cuda"  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
+)
 dtype = "bfloat16"  # float32|bfloat16|float16
 compile = False  # use PyTorch 2.0 to compile the model to be faster
 # -----------------------------------------------------------------------------
@@ -81,7 +85,9 @@
     for k, v in globals().items()
     if not k.startswith("_") and isinstance(v, (int, float, bool, str))
 ]
-exec(open("./llama2c/configurator.py").read()) # overrides from command line or config file
+exec(
+    open("./llama2c/configurator.py").read()
+)  # overrides from command line or config file
open("./llama2c/configurator.py").read() +) # overrides from command line or config file config = {k: globals()[k] for k in config_keys} # will be useful for logging @@ -110,7 +116,9 @@ # validating checks assert vocab_source in ["llama2", "custom"] -assert vocab_source == "custom" or vocab_size == 32000, "The vocab from Meta has 32K tokens" +assert ( + vocab_source == "custom" or vocab_size == 32000 +), "The vocab from Meta has 32K tokens" # various inits, derived attributes, I/O setup seed = 1337 @@ -133,10 +141,14 @@ master_process = True seed_offset = 0 ddp_world_size = 1 -tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len +tokens_per_iter = ( + gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len +) if master_process: print(f"tokens per iteration will be: {tokens_per_iter:,}") - print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len") + print( + f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len" + ) if master_process: os.makedirs(out_dir, exist_ok=True) @@ -145,7 +157,11 @@ torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn device_type = "cuda" if "cuda" in device else "cpu" # for later use in torch.autocast # note: float16 data type will automatically use a GradScaler -ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype] +ptdtype = { + "float32": torch.float32, + "bfloat16": torch.bfloat16, + "float16": torch.float16, +}[dtype] ctx = ( nullcontext() if device_type == "cpu" @@ -161,7 +177,7 @@ vocab_source=vocab_source, device=device, num_workers=0, - seed=seed + seed=seed, ) # init these up here, can override if init_from='resume' (i.e. from a checkpoint) @@ -192,7 +208,15 @@ checkpoint_model_args = checkpoint["model_args"] # force these config attributes to be equal otherwise we can't even resume training # the rest of the attributes (e.g. 
-    for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
+    for k in [
+        "dim",
+        "n_layers",
+        "n_heads",
+        "n_kv_heads",
+        "vocab_size",
+        "multiple_of",
+        "max_seq_len",
+    ]:
         model_args[k] = checkpoint_model_args[k]
     # create the model
     gptconf = LLaMA2Args(**model_args)
@@ -213,7 +237,9 @@
 scaler = torch.cuda.amp.GradScaler(enabled=(dtype == "float16"))
 
 # optimizer
-optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
+optimizer = model.configure_optimizers(
+    weight_decay, learning_rate, (beta1, beta2), device_type
+)
 if init_from == "resume" and "optimizer" in checkpoint:
     optimizer.load_state_dict(checkpoint["optimizer"])
 checkpoint = None  # free up memory
@@ -232,6 +258,7 @@
     model._ddp_params_and_buffers_to_ignore = {prefix + "freqs_cis"}
     model = DDP(model, device_ids=[ddp_local_rank])
 
+
 # helps estimate an arbitrarily accurate loss over either split using many batches
 @torch.no_grad()
 def estimate_loss():
@@ -250,6 +277,7 @@ def estimate_loss():
     model.train()
     return out
 
+
 # learning rate decay scheduler (cosine with warmup)
 def get_lr(it):
     # 1) linear warmup for warmup_iters steps
@@ -264,10 +292,14 @@ def get_lr(it):
     coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
     return min_lr + coeff * (learning_rate - min_lr)
 
+
 # logging
 if wandb_log and master_process:
     import wandb
-    wandb.init(entity=wandb_entity, project=wandb_project, name=wandb_run_name, config=config)
+
+    wandb.init(
+        entity=wandb_entity, project=wandb_project, name=wandb_run_name, config=config
+    )
 
 # training loop
 t0 = time.time()
@@ -277,9 +309,8 @@ def get_lr(it):
 epoch = 0
 for epoch in range(max_epochs):
     train_batch_iter = iter_batches(split="train", epoch=epoch)
-    X, Y = next(train_batch_iter) # fetch the very first batch
+    X, Y = next(train_batch_iter)  # fetch the very first batch
     for _ in tqdm(range(num_steps)):
-
        # determine and set the learning rate for this iteration
        lr = get_lr(iter_num) if decay_lr else learning_rate
        for param_group in optimizer.param_groups:
@@ -288,7 +319,9 @@ def get_lr(it):
        # evaluate the loss on train/val sets and write checkpoints
        if iter_num % eval_interval == 0 and master_process:
            losses = estimate_loss()
-            print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+            print(
+                f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
+            )
            if wandb_log:
                try:
                    wandb.log(
@@ -299,7 +332,8 @@ def get_lr(it):
                            "loss/val": losses["val"],
                            "lr": lr,
                            "mfu": running_mfu * 100,  # convert to percentage
-                        }, step = iter_num
+                        },
+                        step=iter_num,
                    )
                except Exception as e:
                    print(f"logging to wandb failed: {e}")
@@ -316,7 +350,9 @@ def get_lr(it):
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
-                model_export(raw_model, os.path.join(out_dir, "model.bin"), version=0)
+                model_export(
+                    raw_model, os.path.join(out_dir, "model.bin"), version=0
+                )
        if iter_num == 0 and eval_only:
            break
 
@@ -328,7 +364,9 @@ def get_lr(it):
            # the official way to do this is with model.no_sync() context manager, but
            # I really dislike that this bloats the code and forces us to repeat code
            # looking at the source of that context manager, it just toggles this variable
-            model.require_backward_grad_sync = micro_step == gradient_accumulation_steps - 1
+            model.require_backward_grad_sync = (
+                micro_step == gradient_accumulation_steps - 1
+            )
            with ctx:
                logits = model(X, Y)
                loss = raw_model.last_loss
@@ -355,8 +393,12 @@ def get_lr(it):
            # get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point
            lossf = loss.item() * gradient_accumulation_steps
            if local_iter_num >= 5:  # let the training loop settle a bit
-                mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
-                running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
+                mfu = raw_model.estimate_mfu(
+                    batch_size * gradient_accumulation_steps, dt
+                )
+                running_mfu = (
+                    mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
+                )
            print(
                f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%"
            )
diff --git a/src/delphi/train/utils.py b/src/delphi/train/utils.py
index 73c3e9e6..0a62c863 100644
--- a/src/delphi/train/utils.py
+++ b/src/delphi/train/utils.py
@@ -1,6 +1,6 @@
 import json
 
+
 def load_config(config_path):
-    with open(config_path, 'r') as file:
+    with open(config_path, "r") as file:
         return json.load(file)
-
\ No newline at end of file