diff --git a/TransformerLens b/TransformerLens
index 3934846..ebf1e06 160000
--- a/TransformerLens
+++ b/TransformerLens
@@ -1 +1 @@
-Subproject commit 39348462033b7c7dcd80d93cefac8f0b9bfc4382
+Subproject commit ebf1e060e017460e90ecf2e482d196cf936b5fc1
diff --git a/src/lm_saes/activation/activation_dataset.py b/src/lm_saes/activation/activation_dataset.py
index 2638962..222093d 100644
--- a/src/lm_saes/activation/activation_dataset.py
+++ b/src/lm_saes/activation/activation_dataset.py
@@ -5,9 +5,7 @@
 from tqdm.auto import tqdm
 from transformer_lens import HookedTransformer
 
-from ..config import (
-    ActivationGenerationConfig,
-)
+from ..config import ActivationGenerationConfig
 from ..utils.misc import is_master, print_once
 from .activation_store import ActivationStore
 from .token_source import TokenSource
@@ -90,6 +88,7 @@ def make_activation_dataset(model: HookedTransformer, cfg: ActivationGenerationC
     pbar = tqdm(
         total=total_generating_tokens,
         desc=f"Activation dataset Rank {dist.get_rank()}" if dist.is_initialized() else "Activation dataset",
+        smoothing=0.001,
     )
 
     while n_tokens < total_generating_tokens:
diff --git a/src/lm_saes/activation/activation_store.py b/src/lm_saes/activation/activation_store.py
index 4ee1aee..c637ab1 100644
--- a/src/lm_saes/activation/activation_store.py
+++ b/src/lm_saes/activation/activation_store.py
@@ -6,6 +6,7 @@
 import torch.distributed._functional_collectives as funcol
 import torch.utils.data
 from torch.distributed.device_mesh import init_device_mesh
+from tqdm import tqdm
 from transformer_lens import HookedTransformer
 
 from ..config import ActivationStoreConfig
@@ -26,7 +27,10 @@ def __init__(self, act_source: ActivationSource, cfg: ActivationStoreConfig):
         self.tp_size = cfg.tp_size
         self._store: Dict[str, torch.Tensor] = {}
         self._all_gather_buffer: Dict[str, torch.Tensor] = {}
-        self.device_mesh = init_device_mesh("cuda", (self.ddp_size, self.tp_size), mesh_dim_names=("ddp", "tp"))
+        if self.tp_size > 1 or self.ddp_size > 1:
+            self.device_mesh = init_device_mesh("cuda", (self.ddp_size, self.tp_size), mesh_dim_names=("ddp", "tp"))
+        else:
+            self.device_mesh = None
 
     def initialize(self):
         self.refill()
@@ -41,6 +45,14 @@ def shuffle(self):
             self._store[k] = self._store[k][perm]
 
     def refill(self):
+        pbar = tqdm(
+            total=self.buffer_size,
+            desc="Refilling activation store",
+            smoothing=0,
+            leave=False,
+            initial=self.__len__(),
+        )
+        n_seqs = 0
         while self.__len__() < self.buffer_size:
             new_act = self.act_source.next()
             if new_act is None:
@@ -53,6 +65,10 @@
                     self._store[k] = torch.cat([self._store[k], v], dim=0)
             # Check if all activations have the same size
             assert len(set(v.size(0) for v in self._store.values())) == 1
+            n_seqs += 1
+            pbar.update(next(iter(new_act.values())).size(0))
+            pbar.set_postfix({"Sequences": n_seqs})
+        pbar.close()
 
     def __len__(self):
         if len(self._store) == 0:
@@ -75,6 +91,7 @@ def next(self, batch_size) -> Dict[str, torch.Tensor] | None:
         if dist.is_initialized():  # Wait for all processes to refill the store
             dist.barrier()
         if self.tp_size > 1:
+            assert self.device_mesh is not None, "Device mesh not initialized"
             for k, v in self._store.items():
                 if k not in self._all_gather_buffer:
                     self._all_gather_buffer[k] = torch.empty(size=(0,), dtype=v.dtype, device=self.device)
diff --git a/src/lm_saes/activation/token_source.py b/src/lm_saes/activation/token_source.py
index a018f6b..35d7506 100644
--- a/src/lm_saes/activation/token_source.py
+++ b/src/lm_saes/activation/token_source.py
@@ -43,7 +43,18 @@ def __init__(
 
     def fill_with_one_batch(self, batch: dict[str, Any], pack: bool, prepend_bos: bool) -> None:
         if self.is_dataset_tokenized:
-            tokens: torch.Tensor = batch["tokens"].to(self.device)
+            if isinstance(batch["input_ids"], torch.Tensor):
+                assert not batch["input_ids"].dtype.is_floating_point, "input_ids must be a tensor of integers"
+                tokens = batch["input_ids"].to(self.device)
+            else:
+                assert isinstance(batch["input_ids"], list), "input_ids must be a list or a tensor"
+                print("Batch size:", len(batch["input_ids"]), "Type:", type(batch["input_ids"]))
+                print("Sequence length:", len(batch["input_ids"][0]), "Type:", type(batch["input_ids"][0]))
+                # Check if all sequences in the batch have the same length
+                assert all(
+                    len(seq) == len(batch["input_ids"][0]) for seq in batch["input_ids"]
+                ), "All sequences must have the same length"
+                tokens = torch.tensor(batch["input_ids"], dtype=torch.long, device=self.device)
         else:
             tokens = self.model.to_tokens(batch["text"], prepend_bos=prepend_bos).to(self.device)
         if pack:
@@ -124,6 +135,7 @@ def _process_dataset(dataset_path: str, cfg: TextDatasetConfig):
         shard = dataset.shard(num_shards=dist.get_world_size(), index=shard_id, contiguous=True)
     else:
         shard = dataset
+    shard = shard.with_format("torch")
 
     dataloader = DataLoader(
         dataset=cast(Dataset[dict[str, Any]], shard), batch_size=cfg.store_batch_size, pin_memory=True
diff --git a/src/lm_saes/entrypoint.py b/src/lm_saes/entrypoint.py
index f0961a1..f952a9a 100644
--- a/src/lm_saes/entrypoint.py
+++ b/src/lm_saes/entrypoint.py
@@ -9,6 +9,7 @@ class SupportedRunner(Enum):
     EVAL = "eval"
     ANALYZE = "analyze"
     PRUNE = "prune"
+    GENERATE_ACTIVATIONS = "gen-activations"
 
     def __str__(self):
         return self.value
@@ -97,6 +98,12 @@ def entrypoint():
 
         config = LanguageModelSAEPruningConfig.from_flattened(config)
         language_model_sae_prune_runner(config)
+    elif args.runner == SupportedRunner.GENERATE_ACTIVATIONS:
+        from lm_saes.config import ActivationGenerationConfig
+        from lm_saes.runner import activation_generation_runner
+
+        config = ActivationGenerationConfig.from_flattened(config)
+        activation_generation_runner(config)
     else:
         raise ValueError(f"Unsupported runner: {args.runner}.")
 
diff --git a/src/lm_saes/runner.py b/src/lm_saes/runner.py
index 4221ad8..a629543 100644
--- a/src/lm_saes/runner.py
+++ b/src/lm_saes/runner.py
@@ -11,7 +11,12 @@
     parallelize_module,
 )
 from transformer_lens import HookedTransformer
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    ChameleonForConditionalGeneration,
+    PreTrainedModel,
+)
 
 from .activation.activation_dataset import make_activation_dataset
 from .activation.activation_source import CachedActivationSource
@@ -21,6 +26,7 @@
 from .config import (
     ActivationGenerationConfig,
     FeaturesDecoderConfig,
+    LanguageModelConfig,
     LanguageModelCrossCoderTrainingConfig,
     LanguageModelSAEAnalysisConfig,
     LanguageModelSAEPruningConfig,
@@ -36,36 +42,47 @@
 from .utils.misc import is_master
 
 
+def get_model(cfg: LanguageModelConfig):
+    if "chameleon" in cfg.model_name:
+        hf_model = ChameleonForConditionalGeneration.from_pretrained(
+            (cfg.model_name if cfg.model_from_pretrained_path is None else cfg.model_from_pretrained_path),
+            cache_dir=cfg.cache_dir,
+            local_files_only=cfg.local_files_only,
+            torch_dtype=cfg.dtype,
+        )
+    else:
+        hf_model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
+            (cfg.model_name if cfg.model_from_pretrained_path is None else cfg.model_from_pretrained_path),
+            cache_dir=cfg.cache_dir,
+            local_files_only=cfg.local_files_only,
+            torch_dtype=cfg.dtype,
+        )
+    hf_tokenizer = AutoTokenizer.from_pretrained(
+        (cfg.model_name if cfg.model_from_pretrained_path is None else cfg.model_from_pretrained_path),
+        trust_remote_code=True,
+        use_fast=True,
+        add_bos_token=True,
+    )
+    model = HookedTransformer.from_pretrained_no_processing(
+        cfg.model_name,
+        use_flash_attn=cfg.use_flash_attn,
+        device=cfg.device,
+        cache_dir=cfg.cache_dir,
+        hf_model=hf_model,
+        tokenizer=hf_tokenizer,
+        dtype=cfg.dtype,
+    )
+    model.eval()
+    return model
+
+
 def language_model_sae_runner(cfg: LanguageModelSAETrainingConfig):
     if cfg.act_store.use_cached_activations:
         activation_source = CachedActivationSource(cfg.act_store)
         activation_store = ActivationStore(act_source=activation_source, cfg=cfg.act_store)
         model = None
     else:
-        hf_model = AutoModelForCausalLM.from_pretrained(
-            (cfg.lm.model_name if cfg.lm.model_from_pretrained_path is None else cfg.lm.model_from_pretrained_path),
-            cache_dir=cfg.lm.cache_dir,
-            local_files_only=cfg.lm.local_files_only,
-            torch_dtype=cfg.lm.dtype,
-        )
-        hf_tokenizer = AutoTokenizer.from_pretrained(
-            (cfg.lm.model_name if cfg.lm.model_from_pretrained_path is None else cfg.lm.model_from_pretrained_path),
-            trust_remote_code=True,
-            use_fast=True,
-            add_bos_token=True,
-        )
-
-        model = HookedTransformer.from_pretrained_no_processing(
-            cfg.lm.model_name,
-            use_flash_attn=cfg.lm.use_flash_attn,
-            device=cfg.lm.device,
-            cache_dir=cfg.lm.cache_dir,
-            hf_model=hf_model,
-            tokenizer=hf_tokenizer,
-            dtype=cfg.lm.dtype,
-        )
-        model.offload_params_after(cfg.act_store.hook_points[-1], torch.tensor([[0]], device=cfg.lm.device))
-        model.eval()
+        model = get_model(cfg.lm)
     activation_store = ActivationStore.from_config(model=model, cfg=cfg.act_store)
 
     if not cfg.finetuning and (
@@ -182,28 +199,7 @@ def language_model_sae_prune_runner(cfg: LanguageModelSAEPruningConfig):
     cfg.sae.save_hyperparameters(os.path.join(cfg.exp_result_path))
     cfg.lm.save_lm_config(os.path.join(cfg.exp_result_path))
     sae = SparseAutoEncoder.from_config(cfg=cfg.sae)
-    hf_model = AutoModelForCausalLM.from_pretrained(
-        (cfg.lm.model_name if cfg.lm.model_from_pretrained_path is None else cfg.lm.model_from_pretrained_path),
-        cache_dir=cfg.lm.cache_dir,
-        local_files_only=cfg.lm.local_files_only,
-        torch_dtype=cfg.lm.dtype,
-    )
-    hf_tokenizer = AutoTokenizer.from_pretrained(
-        (cfg.lm.model_name if cfg.lm.model_from_pretrained_path is None else cfg.lm.model_from_pretrained_path),
-        trust_remote_code=True,
-        use_fast=True,
-        add_bos_token=True,
-    )
-    model = HookedTransformer.from_pretrained_no_processing(
-        cfg.lm.model_name,
-        use_flash_attn=cfg.lm.use_flash_attn,
-        device=cfg.lm.device,
-        cache_dir=cfg.lm.cache_dir,
-        hf_model=hf_model,
-        tokenizer=hf_tokenizer,
-        dtype=cfg.lm.dtype,
-    )
-    model.eval()
+    model = get_model(cfg.lm)
     activation_store = ActivationStore.from_config(model=model, cfg=cfg.act_store)
     if cfg.wandb.log_to_wandb and is_master():
         wandb_config: dict = {
@@ -243,29 +239,7 @@ def language_model_sae_prune_runner(cfg: LanguageModelSAEPruningConfig):
 
 def language_model_sae_eval_runner(cfg: LanguageModelSAERunnerConfig):
     sae = SparseAutoEncoder.from_config(cfg=cfg.sae)
-    hf_model = AutoModelForCausalLM.from_pretrained(
-        (cfg.lm.model_name if cfg.lm.model_from_pretrained_path is None else cfg.lm.model_from_pretrained_path),
-        cache_dir=cfg.lm.cache_dir,
-        local_files_only=cfg.lm.local_files_only,
-    )
-
-    hf_tokenizer = AutoTokenizer.from_pretrained(
-        (cfg.lm.model_name if cfg.lm.model_from_pretrained_path is None else cfg.lm.model_from_pretrained_path),
-        trust_remote_code=True,
-        use_fast=True,
-        add_bos_token=True,
-    )
-    model = HookedTransformer.from_pretrained_no_processing(
-        cfg.lm.model_name,
-        use_flash_attn=cfg.lm.use_flash_attn,
-        device=cfg.lm.device,
-        cache_dir=cfg.lm.cache_dir,
-        hf_model=hf_model,
-        tokenizer=hf_tokenizer,
-        dtype=cfg.lm.dtype,
-    )
-
-    model.eval()
+    model = get_model(cfg.lm)
     activation_store = ActivationStore.from_config(model=model, cfg=cfg.act_store)
 
     if cfg.wandb.log_to_wandb and is_master():
@@ -301,27 +275,7 @@ def language_model_sae_eval_runner(cfg: LanguageModelSAERunnerConfig):
 
 
 def activation_generation_runner(cfg: ActivationGenerationConfig):
-    hf_model = AutoModelForCausalLM.from_pretrained(
-        (cfg.lm.model_name if cfg.lm.model_from_pretrained_path is None else cfg.lm.model_from_pretrained_path),
-        cache_dir=cfg.lm.cache_dir,
-        local_files_only=cfg.lm.local_files_only,
-    )
-    hf_tokenizer = AutoTokenizer.from_pretrained(
-        (cfg.lm.model_name if cfg.lm.model_from_pretrained_path is None else cfg.lm.model_from_pretrained_path),
-        trust_remote_code=True,
-        use_fast=True,
-        add_bos_token=True,
-    )
-    model = HookedTransformer.from_pretrained_no_processing(
-        cfg.lm.model_name,
-        use_flash_attn=cfg.lm.use_flash_attn,
-        device=cfg.lm.device,
-        cache_dir=cfg.lm.cache_dir,
-        hf_model=hf_model,
-        tokenizer=hf_tokenizer,
-        dtype=cfg.lm.dtype,
-    )
-    model.eval()
+    model = get_model(cfg.lm)
 
     make_activation_dataset(model, cfg)
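
Note (reviewer sketch, not part of the patch): the new gen-activations subcommand only routes the parsed config through the two calls added in entrypoint(). A minimal Python equivalent is below; it uses only names introduced by this diff, and the shape of the flattened config dict is assumed to match whatever the CLI already parses. The hypothetical helper name run_gen_activations is illustrative, not part of the codebase.

    # Hypothetical helper mirroring the added elif branch in entrypoint();
    # `config` is the flattened key/value mapping the CLI has already loaded.
    from lm_saes.config import ActivationGenerationConfig
    from lm_saes.runner import activation_generation_runner

    def run_gen_activations(config: dict) -> None:
        cfg = ActivationGenerationConfig.from_flattened(config)
        # activation_generation_runner loads the model via get_model(cfg.lm)
        # and then calls make_activation_dataset(model, cfg).
        activation_generation_runner(cfg)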