fix: changing scorers dict size issue when evaluating during training
percevalw committed Nov 28, 2024
commit 2e9a843 · 1 parent b3ed86e
Showing 2 changed files with 20 additions and 9 deletions.
changelog.md (1 change: 1 addition & 0 deletions)

@@ -6,6 +6,7 @@

 - Fix `join_thread` missing attribute in `SimpleQueue` when cleaning a multiprocessing executor
 - Support huggingface transformers that do not set `cls_token_id` and `sep_token_id` (we now also look for these tokens in the `special_tokens_map` and `vocab` mappings)
+- Fix changing scorers dict size issue when evaluating during training

## v0.14.0 (2024-11-14)

edsnlp/training/trainer.py (28 changes: 19 additions & 9 deletions)

@@ -95,13 +95,15 @@ def set_flat_stats(x, stats):

 @validate_arguments
 class GenericScorer:
-    def __init__(self, speed=True, **scorers):
+    def __init__(self, speed=True, batch_size: Union[int, str] = 1, **scorers):
         self.scorers = scorers
         self.speed = speed
+        self.batch_size = batch_size
 
     def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
         scores = {}
         docs = list(docs)
+        scorers = dict(self.scorers)
 
         # Speed
         if self.speed:
@@ -118,17 +120,21 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
             name for name, pipe in nlp.pipeline if isinstance(pipe, BaseNERComponent)
         ]
         ner_scorers = {
-            name: scorer
-            for name, scorer in self.scorers.items()
-            if isinstance(scorer, NerMetric)
+            name: scorers.pop(name)
+            for name in list(scorers)
+            if isinstance(scorers[name], NerMetric)
         }
         if ner_pipes and ner_scorers:
             clean_ner_docs = [d.copy() for d in tqdm(docs, desc="Copying docs")]
             for d in clean_ner_docs:
                 d.ents = []
                 d.spans.clear()
             with nlp.select_pipes(enable=ner_pipes):
-                ner_preds = list(nlp.pipe(tqdm(clean_ner_docs, desc="Predicting")))
+                ner_preds = list(
+                    nlp.pipe(tqdm(clean_ner_docs, desc="Predicting")).set_processing(
+                        batch_size=self.batch_size
+                    )
+                )
             for name, scorer in ner_scorers.items():
                 scores[name] = scorer(docs, ner_preds)

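Note on the hunk above: the NER metrics are now popped out of a local copy of `self.scorers` (the `scorers = dict(self.scorers)` line added earlier), so the scorer's own dict never changes size across repeated evaluations during training. A minimal standalone sketch of this copy-then-pop pattern, using illustrative names only (this is not edsnlp code):

# Popping from self.scorers itself would shrink the shared dict on the first
# evaluation; popping from a throwaway copy keeps it intact for later calls.

class Metric:  # stand-in for NerMetric / SpanAttributeMetric
    pass

class Scorer:
    def __init__(self, **scorers):
        self.scorers = scorers

    def split_metrics(self):
        scorers = dict(self.scorers)     # local copy, as in the diff above
        picked = {
            name: scorers.pop(name)      # mutate only the copy
            for name in list(scorers)    # iterate over a snapshot of the keys
            if isinstance(scorers[name], Metric)
        }
        return picked, scorers           # remaining custom scorers stay available

scorer = Scorer(ner=Metric(), custom=lambda docs, preds: 1.0)
assert scorer.split_metrics() == scorer.split_metrics()  # repeatable call after call
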
@@ -139,9 +145,9 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
             if isinstance(pipe, BaseSpanAttributeClassifierComponent)
         ]
         span_attr_scorers = {
-            name: scorer
-            for name, scorer in self.scorers.items()
-            if isinstance(scorer, SpanAttributeMetric)
+            name: scorers.pop(name)
+            for name in list(scorers)
+            if isinstance(scorers[name], SpanAttributeMetric)
         }
         if qlf_pipes and span_attr_scorers:
             clean_qlf_docs = [d.copy() for d in tqdm(docs, desc="Copying docs")]
@@ -152,7 +158,11 @@ def __call__(self, nlp: Pipeline, docs: Iterable[Any]):
                     for qlf in nlp.get_pipe(name).attributes:
                         BINDING_SETTERS[(qlf, None)](span)
             with nlp.select_pipes(disable=ner_pipes):
-                qlf_preds = list(nlp.pipe(tqdm(clean_qlf_docs, desc="Predicting")))
+                qlf_preds = list(
+                    nlp.pipe(tqdm(clean_qlf_docs, desc="Predicting")).set_processing(
+                        batch_size=self.batch_size
+                    )
+                )
             for name, scorer in span_attr_scorers.items():
                 scores[name] = scorer(docs, qlf_preds)

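The other change threads the new `batch_size` argument through to `.set_processing(batch_size=...)` on both prediction streams, so evaluation batching can be tuned independently of training. A hedged usage sketch, assuming only the `GenericScorer` signature shown in the diff; the pipeline, documents and metric objects are placeholders from your own training setup:

# Hypothetical usage; `nlp`, `val_docs`, `ner_metric` and `qlf_metric` come
# from elsewhere, and the keyword names "ner" / "qualifier" are arbitrary
# (**scorers accepts any names).
from edsnlp.training.trainer import GenericScorer

def evaluate(nlp, val_docs, ner_metric, qlf_metric):
    scorer = GenericScorer(
        speed=True,            # keep the speed benchmark
        batch_size=8,          # forwarded to .set_processing(batch_size=...) at predict time;
                               # the Union[int, str] annotation suggests a batching
                               # expression string may also be accepted
        ner=ner_metric,        # a NerMetric instance
        qualifier=qlf_metric,  # a SpanAttributeMetric instance
    )
    return scorer(nlp, val_docs)  # dict of scores keyed by scorer name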