From 3138a599b883eff4877024c772a4fb3ae2318762 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?=
Date: Fri, 22 Nov 2024 09:49:54 +0100
Subject: [PATCH] fix: changing scorers dict size issue when evaluating during training

---
 changelog.md               |  1 +
 edsnlp/training/trainer.py | 28 +++++++++++++++++++---------
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/changelog.md b/changelog.md
index 3f3df11cc..5390ea017 100644
--- a/changelog.md
+++ b/changelog.md
@@ -6,6 +6,7 @@
 - Fix `join_thread` missing attribute in `SimpleQueue` when cleaning a multiprocessing executor
 - Support huggingface transformers that do not set `cls_token_id` and `sep_token_id` (we now also look for these tokens in the `special_tokens_map` and `vocab` mappings)
+- Fix changing scorers dict size issue when evaluating during training
 
 ## v0.14.0 (2024-11-14)
 
diff --git a/edsnlp/training/trainer.py b/edsnlp/training/trainer.py
index df37118ed..8e819702d 100644
--- a/edsnlp/training/trainer.py
+++ b/edsnlp/training/trainer.py
@@ -95,13 +95,15 @@ def set_flat_stats(x, stats):
 
 @validate_arguments
 class GenericScorer:
-    def __init__(self, speed=True, **scorers):
+    def __init__(self, speed=True, batch_size: Union[int, str] = 1, **scorers):
         self.scorers = scorers
         self.speed = speed
+        self.batch_size = batch_size
 
     def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
         scores = {}
         docs = list(docs)
+        scorers = dict(self.scorers)
 
         # Speed
         if self.speed:
@@ -118,9 +120,9 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
             name for name, pipe in nlp.pipeline if isinstance(pipe, BaseNERComponent)
         ]
         ner_scorers = {
-            name: scorer
-            for name, scorer in self.scorers.items()
-            if isinstance(scorer, NerMetric)
+            name: scorers.pop(name)
+            for name in list(scorers)
+            if isinstance(scorers[name], NerMetric)
         }
         if ner_pipes and ner_scorers:
             clean_ner_docs = [d.copy() for d in tqdm(docs, desc="Copying docs")]
@@ -128,7 +130,11 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
                 d.ents = []
                 d.spans.clear()
             with nlp.select_pipes(enable=ner_pipes):
-                ner_preds = list(nlp.pipe(tqdm(clean_ner_docs, desc="Predicting")))
+                ner_preds = list(
+                    nlp.pipe(tqdm(clean_ner_docs, desc="Predicting")).set_processing(
+                        batch_size=self.batch_size
+                    )
+                )
             for name, scorer in ner_scorers.items():
                 scores[name] = scorer(docs, ner_preds)
 
@@ -139,9 +145,9 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
             if isinstance(pipe, BaseSpanAttributeClassifierComponent)
         ]
         span_attr_scorers = {
-            name: scorer
-            for name, scorer in self.scorers.items()
-            if isinstance(scorer, SpanAttributeMetric)
+            name: scorers.pop(name)
+            for name in list(scorers)
+            if isinstance(scorers[name], SpanAttributeMetric)
         }
         if qlf_pipes and span_attr_scorers:
             clean_qlf_docs = [d.copy() for d in tqdm(docs, desc="Copying docs")]
@@ -152,7 +158,11 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
                         for qlf in nlp.get_pipe(name).attributes:
                             BINDING_SETTERS[(qlf, None)](span)
             with nlp.select_pipes(disable=ner_pipes):
-                qlf_preds = list(nlp.pipe(tqdm(clean_qlf_docs, desc="Predicting")))
+                qlf_preds = list(
+                    nlp.pipe(tqdm(clean_qlf_docs, desc="Predicting")).set_processing(
+                        batch_size=self.batch_size
+                    )
+                )
             for name, scorer in span_attr_scorers.items():
                 scores[name] = scorer(docs, qlf_preds)
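
Below is a minimal, self-contained sketch of the copy-then-pop pattern the patch adopts (the class and function names are illustrative stand-ins, not edsnlp's actual API). Working on a copy of the configured scorers and popping matched entries from a snapshot of its keys means the dict stored on the scorer instance is never mutated, so its size stays stable across repeated evaluations during training, and entries that match neither metric type remain available for later passes.

    # Illustrative sketch only: stand-in classes, not edsnlp's real metric types.
    class NerMetric: ...
    class SpanAttributeMetric: ...

    def split_scorers(configured_scorers):
        scorers = dict(configured_scorers)      # copy: the caller's dict is never mutated
        ner_scorers = {
            name: scorers.pop(name)             # consume matched entries from the copy
            for name in list(scorers)           # iterate over a key snapshot, safe to pop
            if isinstance(scorers[name], NerMetric)
        }
        span_attr_scorers = {
            name: scorers.pop(name)
            for name in list(scorers)
            if isinstance(scorers[name], SpanAttributeMetric)
        }
        return ner_scorers, span_attr_scorers, scorers   # leftovers, e.g. custom scorers

    ner, qlf, rest = split_scorers({"exact_ner": NerMetric(), "negation": SpanAttributeMetric()})
    assert "exact_ner" in ner and "negation" in qlf and not rest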