Skip to content

Commit

Permalink
fix: add autocast and batch_size to eval + better train log
Browse files Browse the repository at this point in the history
  • Loading branch information
percevalw committed Dec 12, 2024
1 parent d5d0ee1 commit 0ce923e
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 13 deletions.
6 changes: 3 additions & 3 deletions edsnlp/training/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,9 +260,9 @@ def __init__(
group["selectors"] = sources
group["params"] = params
cliques.append(group)
cliques = [
{k: v for k, v in group.items() if v is not None} for group in cliques
]
cliques = reversed(
[{k: v for k, v in group.items() if v is not None} for group in cliques]
)

if isinstance(optim, str):
optim = (
Expand Down
48 changes: 38 additions & 10 deletions edsnlp/training/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,17 @@ def set_flat_stats(x, stats):

@validate_arguments
class GenericScorer:
def __init__(self, speed=True, batch_size: Union[int, str] = 1, **scorers):
def __init__(
    self,
    speed: bool = True,
    batch_size: Union[int, str] = 1,
    autocast: Union[bool, Any] = None,
    **scorers,
):
    """Bundle evaluation scorers with shared inference settings.

    Parameters
    ----------
    speed : bool
        If True, also time a full prediction pass over the eval docs and
        report throughput (words/docs per second).
    batch_size : Union[int, str]
        Batch size forwarded to ``nlp.pipe(...).set_processing`` when
        running predictions; an int or a batch-size expression string.
    autocast : Union[bool, Any]
        Autocast (mixed-precision) setting forwarded to
        ``set_processing``; ``None`` keeps the pipeline's default.
        NOTE(review): ``Union[bool, Any]`` collapses to ``Any`` — the
        intended values are presumably a bool or a torch dtype; confirm.
    **scorers
        Named scorer callables, each invoked as ``scorer(docs, preds)``;
        results are keyed by the keyword name in the returned scores.
    """
    self.scorers = scorers
    self.speed = speed
    self.batch_size = batch_size
    self.autocast = autocast

def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
scores = {}
Expand All @@ -115,7 +122,14 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
# Speed
if self.speed:
t0 = time.time()
list(nlp.pipe(d.copy() for d in tqdm(docs, desc="Computing model speed")))
list(
nlp.pipe(
d.copy() for d in tqdm(docs, desc="Computing model speed")
).set_processing(
batch_size=self.batch_size,
autocast=self.autocast,
)
)
duration = time.time() - t0
scores["speed"] = dict(
wps=sum(len(d) for d in docs) / duration,
Expand All @@ -139,7 +153,8 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
with nlp.select_pipes(enable=ner_pipes):
ner_preds = list(
nlp.pipe(tqdm(clean_ner_docs, desc="Predicting")).set_processing(
batch_size=self.batch_size
batch_size=self.batch_size,
autocast=self.autocast,
)
)
for name, scorer in ner_scorers.items():
Expand Down Expand Up @@ -167,7 +182,8 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
with nlp.select_pipes(disable=ner_pipes):
qlf_preds = list(
nlp.pipe(tqdm(clean_qlf_docs, desc="Predicting")).set_processing(
batch_size=self.batch_size
batch_size=self.batch_size,
autocast=self.autocast,
)
)
for name, scorer in span_attr_scorers.items():
Expand All @@ -176,7 +192,12 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
# Custom scorers
for name, scorer in scorers.items():
pred_docs = [d.copy() for d in tqdm(docs, desc="Copying docs")]
preds = list(nlp.pipe(tqdm(pred_docs, desc="Predicting")))
preds = list(
nlp.pipe(tqdm(pred_docs, desc="Predicting")).set_processing(
batch_size=self.batch_size,
autocast=self.autocast,
)
)
scores[name] = scorer(docs, preds)

return scores
Expand Down Expand Up @@ -242,7 +263,7 @@ def __init__(
self,
data: Stream,
batch_size: BatchSizeArg,
shuffle: str,
shuffle: Union[str, Literal[False]],
sub_batch_size: Optional[BatchSizeArg] = None,
pipe_names: Optional[Collection[str]] = None,
post_init: bool = True,
Expand Down Expand Up @@ -445,6 +466,7 @@ def train(
# accelerator.register_for_checkpointing(dataset)
is_main_process = accelerator.is_main_process
device = accelerator.device
accelerator.print(config_meta["unresolved_config"].to_yaml_str())

output_dir = Path(output_dir or Path.cwd() / "artifacts")
output_model_dir = output_model_dir or output_dir / "model-last"
Expand All @@ -453,7 +475,7 @@ def train(
os.makedirs(output_dir, exist_ok=True)
if config_meta is not None: # pragma: no cover
print(config_meta["unresolved_config"].to_yaml_str())
config_meta["unresolved_config"].to_disk(output_dir / "training_config.yml")
config_meta["unresolved_config"].to_disk(output_dir / "train_config.yml")

validation_interval = validation_interval or max_steps // 10
checkpoint_interval = checkpoint_interval or validation_interval
Expand Down Expand Up @@ -515,10 +537,12 @@ def train(
accelerator.print(
"Optimizing groups:"
+ "".join(
"\n - {} {} weight tensors ({:,} parameters)".format(
g.get("selector", "*") + ":" if "selector" in g else "",
"\n - {} weight tensors ({:,} parameters){}".format(
len([p for p in g["params"] if p in grad_params]),
sum([p.numel() for p in g["params"] if p in grad_params]),
": " + " & ".join(g.get("selectors", "*"))
if "selectors" in g
else "",
)
for g in optim.param_groups
)
Expand Down Expand Up @@ -563,7 +587,11 @@ def train(
disable=not is_main_process,
smoothing=0.3,
):
if is_main_process and (step % validation_interval) == 0:
if (
is_main_process
and step > 0
and (step % validation_interval) == 0
):
scores = scorer(nlp, val_docs) if val_docs else {}
all_metrics.append(
{
Expand Down

0 comments on commit 0ce923e

Please sign in to comment.