llama2 & mamba training configs (#113)
* Llama 2 example scripts

* Mamba example scripts

* Added all training config sizes
Added an example Llama2 base, Mamba base and all sizes in their respective folders.

* Moving and reformatting configs

* Llama 2 example scripts

* Mamba example scripts

* Cleaning extra files

* remove src/llama2c

* moved/renamed configs

* bos/eos token ids

* updated configs following the meeting

* static stuff

* updated base configs and simplified config structure

* grad_acc_steps & batch_size fix

* add @beartype to config classes that don't have it yet

* don't ignore incorrect config keys

* gradient_accumulation_steps fix

* fix broken test config

* Updating test configs to work with recent changes

* config testing and fixes

* re-adding accidentally deleted test config

* fix tests that broke after recent changes

* remove minibatch divisibility requirement

* estimate_loss returns float, not tensor

* log train/validation dataset size when training starts

* fix incorrect train split default

* Updating log spaced checkpoints and checkpointing intervals

* update llama2 configs

* stories cfgs: checkpoints, evals, bos & eos

---------

Co-authored-by: Jett <[email protected]>
Co-authored-by: JaiDhyani <[email protected]>
Co-authored-by: Jannik Brinkmann <[email protected]>
4 people authored Apr 24, 2024
1 parent ad2936f commit 262972b
Showing 69 changed files with 531 additions and 516 deletions.
10 changes: 5 additions & 5 deletions src/delphi/static/configs/debug.json → configs/debug.json
@@ -1,12 +1,9 @@
{
"vocab_size": 4096,
"max_seq_len": 512,
"max_epochs": 2,
"eval_interval": 1,
"eval_iters": 1,
"data_config": {
"train_sample_limit": 256
},
"batch_ordering_seed": 42,
"torch_seed": 1337,
"batch_size": 64,
"model_config": {
"model_class": "LlamaForCausalLM",
@@ -16,5 +13,8 @@
"num_hidden_layers": 2,
"num_key_value_heads": 2,
"vocab_size": 4096
},
"dataset": {
"name": "delphi-suite/v0-tinystories-v2-clean-tokenized"
}
}
@@ -1,15 +1,13 @@
{
"run_name": "2024_03_15_17_28_14",
"output_dir": "/Users/jaidhyani/Library/Application Support/delphi",
"dataset": {
"name": "delphi-suite/v0-tinystories-v2-clean-tokenized"
},
"device": "auto",
"eval_interval": 2000,
"log_interval": 1,
"eval_iters": 100,
"eval_only": false,
"always_save_checkpoint": false,
"init_from": "scratch",
"wandb_config": {
"log": false,
"wandb": {
"project": "delphi",
"entity": "set_wandb.entity_to_your_wandb_username_to_make_wandb_logging_work"
},
@@ -38,16 +36,17 @@
"vocab_size": 4096
},
"max_epochs": 10,
"gradient_accumulation_steps": 1,
"grad_clip": 1.0,
"optimizer": {
"gradient_accumulation_steps": 4,
"adam": {
"learning_rate": 0.0005,
"weight_decay": 0.1,
"beta1": 0.9,
"beta2": 0.95,
"grad_clip": 1.0,
"decay_lr": true,
"warmup_iters": 1000,
"min_lr": 0.0
}
},
"batch_ordering_seed": 42,
"torch_seed": 1337
}
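The optimizer block above and several commits in this PR ("grad_acc_steps & batch_size fix", "remove minibatch divisibility requirement") concern how batch_size interacts with gradient_accumulation_steps. Below is a minimal sketch of the usual convention, not delphi's actual training loop; the function name and micro-batching details are illustrative only.

```python
# Minimal sketch of gradient accumulation, assuming the usual convention that
# batch_size is the optimizer batch and is split into gradient_accumulation_steps
# micro-batches. Illustrative only, not delphi's actual training loop.
import torch


def train_step(model, optimizer, batch, gradient_accumulation_steps, grad_clip):
    # batch: LongTensor of token ids, shape (batch_size, seq_len)
    # torch.chunk tolerates uneven splits, so batch_size does not need to be
    # divisible by gradient_accumulation_steps (cf. the "remove minibatch
    # divisibility requirement" commit above).
    micro_batches = torch.chunk(batch, gradient_accumulation_steps, dim=0)
    optimizer.zero_grad(set_to_none=True)
    for micro in micro_batches:
        out = model(input_ids=micro, labels=micro)  # HF causal-LM loss
        (out.loss / len(micro_batches)).backward()  # average over micro-batches
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
```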
@@ -1,13 +1,8 @@
{
"vocab_size": 4096,
"max_seq_len": 512,
"max_epochs": 2,
"eval_interval": 1,
"log_interval": 1,
"eval_iters": 10,
"data_config": {
"train_sample_limit": 64
},
"batch_size": 8,
"model_config": {
"model_class": "MambaForCausalLM",
@@ -18,5 +13,10 @@
"conv_kernel": 2,
"expand": 2,
"time_step_rank": 2
},
"batch_ordering_seed": 42,
"torch_seed": 1337,
"dataset": {
"name": "delphi-suite/v0-tinystories-v2-clean-tokenized"
}
}
@@ -1,9 +1,7 @@
{
"vocab_size": 4096,
"max_seq_len": 512,
"max_epochs": 10,
"eval_interval": 10,
"eval_iters": 8,
"max_epochs": 2,
"eval_iters": 1,
"batch_size": 64,
"model_config": {
"model_class": "BloomForCausalLM",
@@ -12,14 +10,19 @@
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_dropout": 0.0,
"hidden_size": 64,
"hidden_size": 8,
"initializer_range": 0.02,
"layer_norm_epsilon": 1e-05,
"n_head": 8,
"n_layer": 10,
"n_head": 2,
"n_layer": 2,
"pretraining_tp": 1,
"slow_but_exact": false,
"use_cache": true,
"vocab_size": 4096
},
"batch_ordering_seed": 42,
"torch_seed": 1337,
"dataset": {
"name": "delphi-suite/v0-tinystories-v2-clean-tokenized"
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/100k.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 12,
"intermediate_size": 48,
"num_attention_heads": 2,
"num_hidden_layers": 1,
"num_key_value_heads": 1
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/10m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 332,
"intermediate_size": 896,
"num_attention_heads": 12,
"num_hidden_layers": 6,
"num_key_value_heads": 6
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/1m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 84,
"intermediate_size": 256,
"num_attention_heads": 8,
"num_hidden_layers": 4,
"num_key_value_heads": 4
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/2.5m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 168,
"intermediate_size": 384,
"num_attention_heads": 8,
"num_hidden_layers": 4,
"num_key_value_heads": 4
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/250k.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 28,
"intermediate_size": 96,
"num_attention_heads": 4,
"num_hidden_layers": 2,
"num_key_value_heads": 2
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/25m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 484,
"intermediate_size": 1332,
"num_attention_heads": 16,
"num_hidden_layers": 8,
"num_key_value_heads": 8
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/500k.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 52,
"intermediate_size": 184,
"num_attention_heads": 4,
"num_hidden_layers": 2,
"num_key_value_heads": 2
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/50k.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 6,
"intermediate_size": 24,
"num_attention_heads": 2,
"num_hidden_layers": 1,
"num_key_value_heads": 1
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/50m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 708,
"intermediate_size": 1896,
"num_attention_heads": 16,
"num_hidden_layers": 8,
"num_key_value_heads": 8
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/5m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 232,
"intermediate_size": 512,
"num_attention_heads": 12,
"num_hidden_layers": 6,
"num_key_value_heads": 6
}
}
7 changes: 7 additions & 0 deletions configs/stories/llama2/README.md
@@ -0,0 +1,7 @@
pad_token_id - not set, since we're not using padding
use_cache - using the default
pretraining_tp - experimental parallelization we're not using; the default
tie_word_embeddings - Llama 2 used False, which is also better for interpretability; note that llama2.c uses True by default, which is probably a more efficient use of parameters for very small models
rope_theta, rope_scaling - widely used defaults
attention_bias - the default is no biases on the QKV and output projections, and that's what we're using
attention_dropout - the only dropout Llama 2 can use; it defaults to prob=0 and that's what we're using
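A quick way to check the HuggingFace defaults these notes refer to (a sketch assuming a recent transformers version; exact defaults can vary between releases):

```python
from transformers import LlamaConfig

cfg = LlamaConfig()
print(cfg.pretraining_tp)       # 1 - experimental tensor parallelism, unused here
print(cfg.use_cache)            # True
print(cfg.attention_bias)       # False - no biases on QKV / output projections
print(cfg.attention_dropout)    # 0.0 - the only dropout Llama 2 can use
print(cfg.tie_word_embeddings)  # False in HF Llama; llama2.c ties them by default
print(cfg.rope_theta, cfg.rope_scaling)  # 10000.0, None
```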
52 changes: 52 additions & 0 deletions configs/stories/llama2/base.json
@@ -0,0 +1,52 @@
{
"model_config": {
"model_class": "LlamaForCausalLM",
"vocab_size": 4096,
"hidden_act": "silu",
"max_position_embeddings": 512,
"initializer_range": 0.02,
"rms_norm_eps": 1e-06,
"bos_token_id": 0,
"eos_token_id": 1,
"tie_word_embeddings": false,
"rope_theta": 10000.0,
"rope_scaling": null,
"attention_bias": false,
"attention_dropout": 0.0
},
"max_seq_len": 512,
"device": "auto",
"checkpoint_interval": 400,
"extra_checkpoint_iters": [
1,
2,
4,
8,
16,
32,
64,
128,
256,
512
],
"log_interval": 40,
"eval_iters": 10,
"batch_size": 256,
"max_epochs": 10,
"grad_clip": 1.0,
"gradient_accumulation_steps": 1,
"adam": {
"learning_rate": 0.0005,
"weight_decay": 0.1,
"beta1": 0.9,
"beta2": 0.95,
"decay_lr": true,
"warmup_iters": 1000,
"min_lr": 0.0
},
"batch_ordering_seed": 1337,
"torch_seed": 42,
"dataset": {
"name": "delphi-suite/stories-tokenized"
}
}
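The per-size files above only override model_config fields, which suggests they are meant to be layered on top of base.json. Below is a hypothetical sketch of that composition plus a parameter count as a sanity check; merge_configs and the file paths are illustrative, not delphi's actual API.

```python
# Hypothetical sketch: overlay a size config on base.json and count parameters.
# merge_configs and the file paths are illustrative, not delphi's actual API.
import json

from transformers import LlamaConfig, LlamaForCausalLM


def merge_configs(base: dict, override: dict) -> dict:
    """Recursively overlay `override` on top of `base`."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_configs(merged[key], value)
        else:
            merged[key] = value
    return merged


with open("configs/stories/llama2/base.json") as f:
    base = json.load(f)
with open("configs/stories/llama2/2.5m.json") as f:
    override = json.load(f)

model_cfg = merge_configs(base, override)["model_config"]
model_cfg.pop("model_class")  # not a LlamaConfig field
model = LlamaForCausalLM(LlamaConfig(**model_cfg))
print(sum(p.numel() for p in model.parameters()))  # ~2.5M for 2.5m.json
```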
6 changes: 6 additions & 0 deletions configs/stories/mamba/100k.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 24,
"num_hidden_layers": 2
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/10m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 400,
"num_hidden_layers": 8
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/1m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 112,
"num_hidden_layers": 6
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/2.5m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 204,
"num_hidden_layers": 6
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/250k.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 36,
"num_hidden_layers": 4
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/25m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 664,
"num_hidden_layers": 8
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/500k.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 76,
"num_hidden_layers": 4
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/50k.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 12,
"num_hidden_layers": 2
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/50m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 952,
"num_hidden_layers": 8
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/5m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 308,
"num_hidden_layers": 6
}
}
10 changes: 10 additions & 0 deletions configs/stories/mamba/README.md
@@ -0,0 +1,10 @@
pad_token_id - we're not using pad tokens, so we don't set it
layer_norm_eps - Mamba's norm epsilon, different from Llama's rms_norm_eps
initializer_range - different in Mamba & Llama
residual_in_fp32 - Mamba-specific parameter
time_step_* - Mamba-specific, sane defaults
there is no way to untie embeddings and unembeddings in Mamba; they're tied by default:
https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/mamba/modeling_mamba.py#L602-L610
rescale_prenorm_residual - True in the original paper, so we set it to True, despite the HF default being False
use_cache - using the default
state_size - using the default
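A quick check of the defaults these notes refer to, and of the one value we override (a sketch assuming transformers >= 4.39, where MambaConfig was added; exact defaults can vary between releases):

```python
from transformers import MambaConfig

default = MambaConfig()
print(default.rescale_prenorm_residual)  # False in HF, despite True in the paper
print(default.state_size)                # kept as-is
print(default.use_cache)                 # kept as-is
print(default.initializer_range)         # differs from Llama's 0.02

# Our configs override rescale_prenorm_residual to match the paper; the sizes
# here are the ones from 100k.json above.
cfg = MambaConfig(rescale_prenorm_residual=True, hidden_size=24, num_hidden_layers=2)
```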