From 4aaf3c541b50ae556dc1da666b0060db0dfad6ae Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Wed, 31 Jul 2024 09:17:25 -0400 Subject: [PATCH] Add loss tracking --- benchmarks/llm/dev.yaml | 3 ++- benchmarks/llm/recipes/full_finetune_distributed.py | 4 +--- benchmarks/llm/recipes/lora_finetune_single_device.py | 3 +-- benchmarks/llm/voirfile.py | 10 +++++++++- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/benchmarks/llm/dev.yaml b/benchmarks/llm/dev.yaml index ec325b6da..b28eaf65e 100644 --- a/benchmarks/llm/dev.yaml +++ b/benchmarks/llm/dev.yaml @@ -24,13 +24,14 @@ llm-qlora-single: "{milabench_code}/recipes/lora_finetune_single_device.py": true --config: "{milabench_code}/configs/llama3_8B_lora_single_device.yaml" epochs=1: true - batch_size=2: true tokenizer.path={milabench_data}/{milabench_name}/chckpt/original/tokenizer.model: true output_dir={milabench_extra}/output: true checkpointer.checkpoint_dir={milabench_data}/{milabench_name}/chckpt/original: true checkpointer.output_dir={milabench_extra}/chckpt: true metric_logger.log_dir={milabench_extra}/metrics: true repo_id="meta-llama/Meta-Llama-3.1-8B": true + batch_size=8: true + gradient_accumulation_steps=8: true llm-qlora-ddp-gpus: diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py index b7cfa040a..5c68f3ee9 100644 --- a/benchmarks/llm/recipes/full_finetune_distributed.py +++ b/benchmarks/llm/recipes/full_finetune_distributed.py @@ -523,9 +523,7 @@ def train(self) -> None: logits = logits.transpose(1, 2) # Compute loss loss = self._loss_fn(logits, labels) - - loss = loss / self._gradient_accumulation_steps - running_loss += loss + running_loss += (loss := loss / self._gradient_accumulation_steps) loss.backward() # Step with optimizer diff --git a/benchmarks/llm/recipes/lora_finetune_single_device.py b/benchmarks/llm/recipes/lora_finetune_single_device.py index f8c4422a8..aa30dcbfd 100644 ---
a/benchmarks/llm/recipes/lora_finetune_single_device.py +++ b/benchmarks/llm/recipes/lora_finetune_single_device.py @@ -572,8 +572,7 @@ def train(self) -> None: labels = labels[..., 1:].contiguous() logits = logits.transpose(1, 2) # Compute loss - loss = self._loss_fn(logits, labels) - loss = loss / self._gradient_accumulation_steps + loss = self._loss_fn(logits, labels) / self._gradient_accumulation_steps running_loss += loss loss.backward() diff --git a/benchmarks/llm/voirfile.py b/benchmarks/llm/voirfile.py index 18e19c36d..c099079f0 100644 --- a/benchmarks/llm/voirfile.py +++ b/benchmarks/llm/voirfile.py @@ -51,10 +51,15 @@ def wrap_lr_scheduler(scheduler): def newstep(*args, **kwargs): original(*args, **kwargs) - observer.step() + # observer.step() scheduler.step = newstep return scheduler + + def wrap_loss(loss): + observer.record_loss(loss) + observer.step() + return loss probe = ov.probe("//LoRAFinetuneRecipeSingleDevice/_setup_data() as loader", overridable=True) probe['loader'].override(wrap_dataloader) @@ -62,6 +67,9 @@ def newstep(*args, **kwargs): probe = ov.probe("//LoRAFinetuneRecipeSingleDevice/_setup_lr_scheduler() as scheduler", overridable=True) probe['scheduler'].override(wrap_lr_scheduler) + probe = ov.probe("//LoRAFinetuneRecipeSingleDevice/train > loss_to_log", overridable=True) + probe['loss_to_log'].override(wrap_loss) + try: yield ov.phases.run_script except StopProgram: