[brief] Allow users to stop training after a training step.
[detailed]
- This covers the case where the loss goes to infinity or becomes NaN, in
  which case training should stop immediately.
marovira committed May 30, 2024
1 parent f88b9dd commit ce00d4f
Showing 1 changed file with 7 additions and 0 deletions.
src/helios/trainer.py: 7 additions & 0 deletions
@@ -812,6 +812,9 @@ def _train_on_iteration(self, state: TrainingState) -> None:
             if state.global_iteration % accumulation_steps == 0 and not pbar.update():
                 pbar.refresh()
             state.dataset_iter += 1
+            if self.model.should_training_stop():
+                training_done = True
+                break
 
             if (
                 val_freq is not None
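
Note: training_done and the enclosing loops are not visible in the hunk above. The following is a minimal, self-contained control-flow sketch of what the check accomplishes; it is not the actual _train_on_iteration body, the stub model and loop shape are assumptions, and only should_training_stop() comes from the diff.

# Minimal control-flow sketch (assumptions, not helios code): the new check
# lets the batch loop exit right after the step on which the model requests a
# stop, and training_done keeps the outer loop from starting another pass.


class _StubModel:
    """Stand-in model; only should_training_stop() mirrors the hook in the diff."""

    def __init__(self) -> None:
        self._stop = False

    def train_step(self, step: int) -> None:
        if step == 2:  # pretend the loss diverged on this step
            self._stop = True

    def should_training_stop(self) -> bool:
        return self._stop


model = _StubModel()
training_done = False
steps_run = 0

while not training_done:
    for step in range(5):  # stand-in for iterating a dataloader
        model.train_step(step)
        steps_run += 1
        if model.should_training_stop():
            training_done = True
            break  # stop right after the offending step, as in the hunk above
    else:
        training_done = True  # the whole pass finished without a stop request

print(steps_run)  # 3: steps 0, 1 and 2 ran before training stopped
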
@@ -939,6 +942,10 @@ def _train_on_epoch(self, state: TrainingState) -> None:
                 if not ite_pbar.update():
                     ite_pbar.refresh()
 
+                if self.model.should_training_stop():
+                    training_done = True
+                    break
+
             state.dataset_iter = 0
 
             if val_freq is not None and state.global_epoch % val_freq == 0:
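The commit message frames this as a guard against losses that diverge to infinity or become NaN. Below is a minimal model-side sketch of how the new hook could be used for that purpose; it is illustrative only. The class is a plain stand-in rather than the real helios model base class, and the train_step(loss) signature is an assumption; only should_training_stop() matches the method queried by the trainer in the diff above.

import math


class DivergenceAwareModel:
    """Stand-in model (not the helios base class) showing one use of the hook."""

    def __init__(self) -> None:
        self._stop_requested = False

    def train_step(self, loss: float) -> None:
        # After computing the loss for a step, flag divergence so the trainer
        # can stop right after this step instead of continuing with NaN/inf.
        if not math.isfinite(loss):
            self._stop_requested = True

    def should_training_stop(self) -> bool:
        # Queried by the trainer once per training step (see the diff above);
        # returning True makes the training loop set training_done and break.
        return self._stop_requested


model = DivergenceAwareModel()
for loss in (0.91, 0.47, float("nan")):
    model.train_step(loss)
    if model.should_training_stop():
        break

print(model.should_training_stop())  # True: the NaN loss triggered the stop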
