llama2 & mamba training configs (#113)
* Llama 2 example scripts

* Mamba example scripts

* Added all training config sizes
Added an example Llama2 base, Mamba base and all sizes in their respective folders.

* Moving and reformatting configs

* Llama 2 example scripts

* Mamba example scripts

* Cleaning extra files

* remove src/llama2c

* moved/renamed configs

* bos/eos token ids

* updated configs following the meeting

* static stuff

* updated base configs and simplified config structure

* grad_acc_steps & batch_size fix

* add @beartype to config classes that don't have it yet

* don't ignore incorrect config keys

* gradient_accumulation_steps fix

* fix broken test config

* Updating test configs to work with recent changes

* config testing and fixes

* re-adding accidentally deleted test config

* fix tests that broke after recent changes

* remove minibatch divisibility requirement

* estimate_loss returns float, not tensor

* log train/validation dataset size when training starts

* fix incorrect train split default

* Updating log spaced checkpoints and checkpointing intervals

* update llama2 configs

* stories cfgs: checkpoints, evals, bos & eos

---------

Co-authored-by: Jett <[email protected]>
Co-authored-by: JaiDhyani <[email protected]>
Co-authored-by: Jannik Brinkmann <[email protected]>
4 people authored Apr 24, 2024
1 parent ad2936f commit 262972b
Showing 69 changed files with 531 additions and 516 deletions.
10 changes: 5 additions & 5 deletions src/delphi/static/configs/debug.json → configs/debug.json
@@ -1,12 +1,9 @@
{
"vocab_size": 4096,
"max_seq_len": 512,
"max_epochs": 2,
"eval_interval": 1,
"eval_iters": 1,
"data_config": {
"train_sample_limit": 256
},
"batch_ordering_seed": 42,
"torch_seed": 1337,
"batch_size": 64,
"model_config": {
"model_class": "LlamaForCausalLM",
@@ -16,5 +13,8 @@
"num_hidden_layers": 2,
"num_key_value_heads": 2,
"vocab_size": 4096
},
"dataset": {
"name": "delphi-suite/v0-tinystories-v2-clean-tokenized"
}
}
@@ -1,15 +1,13 @@
{
"run_name": "2024_03_15_17_28_14",
"output_dir": "/Users/jaidhyani/Library/Application Support/delphi",
"dataset": {
"name": "delphi-suite/v0-tinystories-v2-clean-tokenized"
},
"device": "auto",
"eval_interval": 2000,
"log_interval": 1,
"eval_iters": 100,
"eval_only": false,
"always_save_checkpoint": false,
"init_from": "scratch",
"wandb_config": {
"log": false,
"wandb": {
"project": "delphi",
"entity": "set_wandb.entity_to_your_wandb_username_to_make_wandb_logging_work"
},
@@ -38,16 +36,17 @@
"vocab_size": 4096
},
"max_epochs": 10,
"gradient_accumulation_steps": 1,
"grad_clip": 1.0,
"optimizer": {
"gradient_accumulation_steps": 4,
"adam": {
"learning_rate": 0.0005,
"weight_decay": 0.1,
"beta1": 0.9,
"beta2": 0.95,
"grad_clip": 1.0,
"decay_lr": true,
"warmup_iters": 1000,
"min_lr": 0.0
}
},
"batch_ordering_seed": 42,
"torch_seed": 1337
}
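The optimizer block above and several commits in this PR ("grad_acc_steps & batch_size fix", "remove minibatch divisibility requirement") concern how batch_size interacts with gradient_accumulation_steps. Below is a minimal sketch of the usual convention, not delphi's actual training loop; the function name and micro-batching details are illustrative only.

```python
# Minimal sketch of gradient accumulation, assuming the usual convention that
# batch_size is the optimizer batch and is split into gradient_accumulation_steps
# micro-batches. Illustrative only, not delphi's actual training loop.
import torch


def train_step(model, optimizer, batch, gradient_accumulation_steps, grad_clip):
    # batch: LongTensor of token ids, shape (batch_size, seq_len)
    # torch.chunk tolerates uneven splits, so batch_size does not need to be
    # divisible by gradient_accumulation_steps (cf. the "remove minibatch
    # divisibility requirement" commit above).
    micro_batches = torch.chunk(batch, gradient_accumulation_steps, dim=0)
    optimizer.zero_grad(set_to_none=True)
    for micro in micro_batches:
        out = model(input_ids=micro, labels=micro)  # HF causal-LM loss
        (out.loss / len(micro_batches)).backward()  # average over micro-batches
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
```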
@@ -1,13 +1,8 @@
{
"vocab_size": 4096,
"max_seq_len": 512,
"max_epochs": 2,
"eval_interval": 1,
"log_interval": 1,
"eval_iters": 10,
"data_config": {
"train_sample_limit": 64
},
"batch_size": 8,
"model_config": {
"model_class": "MambaForCausalLM",
@@ -18,5 +13,10 @@
"conv_kernel": 2,
"expand": 2,
"time_step_rank": 2
},
"batch_ordering_seed": 42,
"torch_seed": 1337,
"dataset": {
"name": "delphi-suite/v0-tinystories-v2-clean-tokenized"
}
}
@@ -1,9 +1,7 @@
{
"vocab_size": 4096,
"max_seq_len": 512,
"max_epochs": 10,
"eval_interval": 10,
"eval_iters": 8,
"max_epochs": 2,
"eval_iters": 1,
"batch_size": 64,
"model_config": {
"model_class": "BloomForCausalLM",
@@ -12,14 +10,19 @@
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_dropout": 0.0,
"hidden_size": 64,
"hidden_size": 8,
"initializer_range": 0.02,
"layer_norm_epsilon": 1e-05,
"n_head": 8,
"n_layer": 10,
"n_head": 2,
"n_layer": 2,
"pretraining_tp": 1,
"slow_but_exact": false,
"use_cache": true,
"vocab_size": 4096
},
"batch_ordering_seed": 42,
"torch_seed": 1337,
"dataset": {
"name": "delphi-suite/v0-tinystories-v2-clean-tokenized"
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/100k.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 12,
"intermediate_size": 48,
"num_attention_heads": 2,
"num_hidden_layers": 1,
"num_key_value_heads": 1
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/10m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 332,
"intermediate_size": 896,
"num_attention_heads": 12,
"num_hidden_layers": 6,
"num_key_value_heads": 6
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/1m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 84,
"intermediate_size": 256,
"num_attention_heads": 8,
"num_hidden_layers": 4,
"num_key_value_heads": 4
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/2.5m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 168,
"intermediate_size": 384,
"num_attention_heads": 8,
"num_hidden_layers": 4,
"num_key_value_heads": 4
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/250k.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 28,
"intermediate_size": 96,
"num_attention_heads": 4,
"num_hidden_layers": 2,
"num_key_value_heads": 2
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/25m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 484,
"intermediate_size": 1332,
"num_attention_heads": 16,
"num_hidden_layers": 8,
"num_key_value_heads": 8
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/500k.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 52,
"intermediate_size": 184,
"num_attention_heads": 4,
"num_hidden_layers": 2,
"num_key_value_heads": 2
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/50k.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 6,
"intermediate_size": 24,
"num_attention_heads": 2,
"num_hidden_layers": 1,
"num_key_value_heads": 1
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/50m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 708,
"intermediate_size": 1896,
"num_attention_heads": 16,
"num_hidden_layers": 8,
"num_key_value_heads": 8
}
}
9 changes: 9 additions & 0 deletions configs/stories/llama2/5m.json
@@ -0,0 +1,9 @@
{
"model_config": {
"hidden_size": 232,
"intermediate_size": 512,
"num_attention_heads": 12,
"num_hidden_layers": 6,
"num_key_value_heads": 6
}
}
7 changes: 7 additions & 0 deletions configs/stories/llama2/README.md
@@ -0,0 +1,7 @@
pad_token_id - not set, since we're not using padding
use_cache - using the default
pretraining_tp - experimental parallelization we're not using; the default
tie_word_embeddings - Llama 2 used False, which is also better for interpretability; note that llama2.c uses True by default, which is probably a more efficient use of parameters for very small models
rope_theta, rope_scaling - widely used defaults
attention_bias - the default is no biases on the QKV and output projections, and that's what we're using
attention_dropout - the only dropout Llama 2 can use; it defaults to prob=0 and that's what we're using
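A quick way to check the HuggingFace defaults these notes refer to (a sketch assuming a recent transformers version; exact defaults can vary between releases):

```python
from transformers import LlamaConfig

cfg = LlamaConfig()
print(cfg.pretraining_tp)       # 1 - experimental tensor parallelism, unused here
print(cfg.use_cache)            # True
print(cfg.attention_bias)       # False - no biases on QKV / output projections
print(cfg.attention_dropout)    # 0.0 - the only dropout Llama 2 can use
print(cfg.tie_word_embeddings)  # False in HF Llama; llama2.c ties them by default
print(cfg.rope_theta, cfg.rope_scaling)  # 10000.0, None
```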
52 changes: 52 additions & 0 deletions configs/stories/llama2/base.json
@@ -0,0 +1,52 @@
{
"model_config": {
"model_class": "LlamaForCausalLM",
"vocab_size": 4096,
"hidden_act": "silu",
"max_position_embeddings": 512,
"initializer_range": 0.02,
"rms_norm_eps": 1e-06,
"bos_token_id": 0,
"eos_token_id": 1,
"tie_word_embeddings": false,
"rope_theta": 10000.0,
"rope_scaling": null,
"attention_bias": false,
"attention_dropout": 0.0
},
"max_seq_len": 512,
"device": "auto",
"checkpoint_interval": 400,
"extra_checkpoint_iters": [
1,
2,
4,
8,
16,
32,
64,
128,
256,
512
],
"log_interval": 40,
"eval_iters": 10,
"batch_size": 256,
"max_epochs": 10,
"grad_clip": 1.0,
"gradient_accumulation_steps": 1,
"adam": {
"learning_rate": 0.0005,
"weight_decay": 0.1,
"beta1": 0.9,
"beta2": 0.95,
"decay_lr": true,
"warmup_iters": 1000,
"min_lr": 0.0
},
"batch_ordering_seed": 1337,
"torch_seed": 42,
"dataset": {
"name": "delphi-suite/stories-tokenized"
}
}
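The per-size files above only override model_config fields, which suggests they are meant to be layered on top of base.json. Below is a hypothetical sketch of that composition plus a parameter count as a sanity check; merge_configs and the file paths are illustrative, not delphi's actual API.

```python
# Hypothetical sketch: overlay a size config on base.json and count parameters.
# merge_configs and the file paths are illustrative, not delphi's actual API.
import json

from transformers import LlamaConfig, LlamaForCausalLM


def merge_configs(base: dict, override: dict) -> dict:
    """Recursively overlay `override` on top of `base`."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_configs(merged[key], value)
        else:
            merged[key] = value
    return merged


with open("configs/stories/llama2/base.json") as f:
    base = json.load(f)
with open("configs/stories/llama2/2.5m.json") as f:
    override = json.load(f)

model_cfg = merge_configs(base, override)["model_config"]
model_cfg.pop("model_class")  # not a LlamaConfig field
model = LlamaForCausalLM(LlamaConfig(**model_cfg))
print(sum(p.numel() for p in model.parameters()))  # ~2.5M for 2.5m.json
```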
6 changes: 6 additions & 0 deletions configs/stories/mamba/100k.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 24,
"num_hidden_layers": 2
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/10m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 400,
"num_hidden_layers": 8
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/1m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 112,
"num_hidden_layers": 6
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/2.5m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 204,
"num_hidden_layers": 6
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/250k.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 36,
"num_hidden_layers": 4
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/25m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 664,
"num_hidden_layers": 8
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/500k.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 76,
"num_hidden_layers": 4
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/50k.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 12,
"num_hidden_layers": 2
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/50m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 952,
"num_hidden_layers": 8
}
}
6 changes: 6 additions & 0 deletions configs/stories/mamba/5m.json
@@ -0,0 +1,6 @@
{
"model_config": {
"hidden_size": 308,
"num_hidden_layers": 6
}
}
10 changes: 10 additions & 0 deletions configs/stories/mamba/README.md
@@ -0,0 +1,10 @@
pad_token_id - we're not using pad tokens, so we don't set it
layer_norm_eps - Mamba's norm epsilon, different from Llama's rms_norm_eps
initializer_range - different in Mamba & Llama
residual_in_fp32 - Mamba-specific parameter
time_step_* - Mamba-specific, sane defaults
there is no way to untie embeddings and unembeddings in Mamba; they're tied by default:
https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/mamba/modeling_mamba.py#L602-L610
rescale_prenorm_residual - True in the original paper, so we set it to True, despite the HF default being False
use_cache - using the default
state_size - using the default
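A quick check of the defaults these notes refer to, and of the one value we override (a sketch assuming transformers >= 4.39, where MambaConfig was added; exact defaults can vary between releases):

```python
from transformers import MambaConfig

default = MambaConfig()
print(default.rescale_prenorm_residual)  # False in HF, despite True in the paper
print(default.state_size)                # kept as-is
print(default.use_cache)                 # kept as-is
print(default.initializer_range)         # differs from Llama's 0.02

# Our configs override rescale_prenorm_residual to match the paper; the sizes
# here are the ones from 100k.json above.
cfg = MambaConfig(rescale_prenorm_residual=True, hidden_size=24, num_hidden_layers=2)
```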