feat(model): support llama3_1
Hzfinfdu authored and StarConnor committed Jul 24, 2024
1 parent 9264e4b commit 3019a37
Showing 4 changed files with 29 additions and 7 deletions.
21 changes: 21 additions & 0 deletions TransformerLens/transformer_lens/loading_from_pretrained.py
@@ -122,7 +122,9 @@
"CodeLlama-7b-Python-hf",
"CodeLlama-7b-Instruct-hf",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Meta-Llama-3.1-8B",
"meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Meta-Llama-3-70B",
"meta-llama/Meta-Llama-3-70B-Instruct",
"Baidicoot/Othello-GPT-Transformer-Lens",
@@ -809,6 +811,25 @@ def convert_hf_model_config(model_name: str, **kwargs):
"final_rms": True,
"gated_mlp": True,
}
elif "Meta-Llama-3.1-8B" in official_model_name:
cfg_dict = {
"d_model": 4096,
"d_head": 128,
"n_heads": 32,
"d_mlp": 14336,
"n_layers": 32,
"n_ctx": 8192,
"eps": 1e-5,
"d_vocab": 128256,
"act_fn": "silu",
"n_key_value_heads": 8,
"normalization_type": "RMS",
"positional_embedding_type": "rotary",
"rotary_adjacent_pairs": False,
"rotary_dim": 128,
"final_rms": True,
"gated_mlp": True,
}
elif "Meta-Llama-3-70B" in official_model_name:
cfg_dict = {
"d_model": 8192,
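The new branch mirrors the existing Meta-Llama-3-8B config: a 4096-dim residual stream split across 32 query heads of width 128, a 14336-dim gated SiLU MLP, RMSNorm, rotary positions, and 8 key/value heads, i.e. grouped-query attention with 4 query heads sharing each KV head. Below is a minimal sketch (not part of this commit) of exercising the new registry entry; `from_pretrained` is the standard TransformerLens entry point, the assertions are just sanity checks, and running it assumes access to the gated meta-llama weights plus `transformers>=4.43.0` (see the pyproject.toml change below).

```python
# Illustrative only (not part of this commit): load the newly registered model.
# Assumes access to the gated meta-llama checkpoints on the Hugging Face Hub
# and transformers>=4.43.0 installed.
from transformer_lens import HookedTransformer

model = HookedTransformer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")

# Sanity-check a few of the fields added above: 32 heads x 128 dims per head
# gives the 4096-dim residual stream, and 8 key/value heads means grouped-query
# attention with 4 query heads sharing each KV head.
assert model.cfg.d_model == model.cfg.n_heads * model.cfg.d_head == 4096
assert model.cfg.n_key_value_heads == 8
```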
10 changes: 5 additions & 5 deletions pdm.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -12,6 +12,7 @@ authors = [
]
dependencies = [
"datasets>=2.17.0",
"transformers>=4.43.0",
"einops>=0.7.0",
"fastapi>=0.110.0",
"matplotlib>=3.8.3",
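The raised `transformers` floor goes with the model addition: Llama 3.1 checkpoints ship config fields (notably the updated RoPE-scaling block) that earlier releases cannot parse, and 4.43.0 is the release that handles them. A small runtime guard along these lines is illustrative only and not part of the commit:

```python
# Illustrative guard (not in the commit): fail fast if the environment is too
# old to parse Meta-Llama-3.1 configs.
from packaging import version
import transformers

if version.parse(transformers.__version__) < version.parse("4.43.0"):
    raise RuntimeError(
        "Meta-Llama-3.1 checkpoints require transformers>=4.43.0; "
        f"found {transformers.__version__}"
    )
```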
4 changes: 2 additions & 2 deletions src/lm_saes/sae.py
@@ -624,8 +624,8 @@ def from_initialization_searching(
cfg: LanguageModelSAETrainingConfig,
):
test_batch = activation_store.next(
-batch_size=cfg.train_batch_size * 8
-) # just random hard code xd
+batch_size=cfg.train_batch_size
+)
activation_in, activation_out = test_batch[cfg.sae.hook_point_in], test_batch[cfg.sae.hook_point_out] # type: ignore

if (
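For context, the deleted `* 8` was an ad-hoc multiplier (per the removed comment) on the batch drawn for the initialization search, so the search now uses exactly one training batch. The sketch below is illustrative rather than the committed code: the `DummyActivationStore`, hook-point names, and batch size are stand-ins that only mimic the dict-of-activations interface implied by the indexing above.

```python
# Toy sketch (not the committed code): the interface the corrected call relies on.
# A real activation store yields a dict of activation tensors keyed by hook point.
import torch

class DummyActivationStore:
    def __init__(self, d_model: int = 4096):
        self.d_model = d_model

    def next(self, batch_size: int) -> dict[str, torch.Tensor]:
        # One tensor per hook point, shaped [batch_size, d_model].
        return {
            "blocks.0.hook_resid_pre": torch.randn(batch_size, self.d_model),
            "blocks.0.hook_resid_post": torch.randn(batch_size, self.d_model),
        }

store = DummyActivationStore()
test_batch = store.next(batch_size=4096)  # one training batch, no ad-hoc 8x multiplier
activation_in = test_batch["blocks.0.hook_resid_pre"]    # activations fed to the SAE
activation_out = test_batch["blocks.0.hook_resid_post"]  # reconstruction targets
```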
