diff --git a/changelog.md b/changelog.md index b0e85e39c..c4ba0b6ac 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,17 @@ # Changelog +## Unreleased + +### Added + +- New `to_duration` method to convert an absolute date into a date relative to the note_datetime (or None) +- New `use_date_label` parameter in `eds.dates` to store absolute and relative dates under the same `date` label (instead of `absolute` and `relative`) + +### Changed + +- Duration time entities (from `eds.dates`) are now stored in the `durations` span group, separate from the `dates` span group +- `to_datetime` now only returns absolute dates: it converts relative dates into absolute ones if `doc._.note_datetime` is given, and returns None otherwise + ## v0.8.0 (2023-03-09) ### Added diff --git a/edsnlp/pipelines/misc/dates/dates.py b/edsnlp/pipelines/misc/dates/dates.py index cc73aa293..ef8768231 100644 --- a/edsnlp/pipelines/misc/dates/dates.py +++ b/edsnlp/pipelines/misc/dates/dates.py @@ -55,6 +55,9 @@ class Dates(BaseComponent): Whether to treat dates as entities attr : str spaCy attribute to use + use_date_label: bool + Whether to use a shared `date` label for absolute and relative dates + instead of `absolute` and `relative` labels """ # noinspection PyProtectedMember @@ -70,8 +73,10 @@ def __init__( detect_time: bool, as_ents: bool, attr: str, + use_date_label: bool = False, ): + self.use_date_label = use_date_label self.nlp = nlp if absolute is None: @@ -195,8 +200,12 @@ def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]: for span, groupdict in dates: if span.label_ == "relative": parsed = RelativeDate.parse_obj(groupdict) + if self.use_date_label: + span.label_ = "date" elif span.label_ == "absolute": parsed = AbsoluteDate.parse_obj(groupdict) + if self.use_date_label: + span.label_ = "date" else: parsed = Duration.parse_obj(groupdict) @@ -277,7 +286,8 @@ def __call__(self, doc: Doc) -> Doc: dates = self.process(doc) dates = self.parse(dates) - doc.spans["dates"] = dates + 
doc.spans["dates"] = [d for d in dates if d.label_ != "duration"] + doc.spans["durations"] = [d for d in dates if d.label_ == "duration"] if self.detect_periods: doc.spans["periods"] = self.process_periods(dates) diff --git a/edsnlp/pipelines/misc/dates/factory.py b/edsnlp/pipelines/misc/dates/factory.py index 0049b1522..20b84fe7d 100644 --- a/edsnlp/pipelines/misc/dates/factory.py +++ b/edsnlp/pipelines/misc/dates/factory.py @@ -35,6 +35,7 @@ def create_component( detect_time: bool = True, as_ents: bool = False, attr: str = "LOWER", + use_date_label: bool = False, ): """ Tags and normalizes dates, using the open-source `dateparser` library. @@ -73,6 +74,9 @@ def create_component( Whether to treat dates as entities attr : str spaCy attribute to use + use_date_label: bool + Whether to use a shared `date` label for absolute and relative dates + instead of `absolute` and `relative` labels """ return Dates( nlp, @@ -85,4 +89,5 @@ def create_component( detect_time=detect_time, as_ents=as_ents, attr=attr, + use_date_label=use_date_label, ) diff --git a/tests/pipelines/misc/test_dates.py b/tests/pipelines/misc/test_dates.py index 4f404008e..5e055fdf9 100644 --- a/tests/pipelines/misc/test_dates.py +++ b/tests/pipelines/misc/test_dates.py @@ -84,10 +84,12 @@ def test_dates_component(blank_nlp: Language): doc = blank_nlp(text) - assert len(doc.spans["dates"]) == len(entities) + assert len((*doc.spans["dates"], *doc.spans["durations"])) == len(entities) assert len(doc.ents) == len(entities) - for span, entity in zip(doc.spans["dates"], entities): + for span, entity in zip( + (*doc.spans["dates"], *doc.spans["durations"]), entities + ): assert span.text == text[entity.start_char : entity.end_char] date = span._.date @@ -227,9 +229,11 @@ def test_time(with_time: bool): doc = nlp(text) - assert len(doc.spans["dates"]) == len(entities) + assert len((*doc.spans["dates"], *doc.spans["durations"])) == len(entities) - for span, entity in zip(doc.spans["dates"], entities): + for 
span, entity in zip( + (*doc.spans["dates"], *doc.spans["durations"]), entities + ): assert span.text == text[entity.start_char : entity.end_char] norm = next(m.value for m in entity.modifiers if m.key == "norm") assert span._.date.norm() == norm @@ -257,7 +261,7 @@ def test_false_positives(blank_nlp: Language): for example in counter_examples: doc = blank_nlp(example) - assert len(doc.spans["dates"]) == 0 + assert len((*doc.spans["dates"], *doc.spans["durations"])) == 0 def test_dates_on_ents_only(): @@ -280,9 +284,9 @@ def test_dates_on_ents_only(): assert len(doc.ents) == 1 - assert len(doc.spans["dates"]) == len(entities) + assert len((*doc.spans["dates"], *doc.spans["durations"])) == len(entities) - for span, entity in zip(doc.spans["dates"], entities): + for span, entity in zip((*doc.spans["dates"], *doc.spans["durations"]), entities): assert span.text == text[entity.start_char : entity.end_char] @@ -293,5 +297,11 @@ def test_illegal_dates(blank_nlp): ) for text in texts: doc = blank_nlp(text) - ent = doc.spans["dates"][0] + ent = (*doc.spans["dates"], *doc.spans["durations"])[0] assert ent._.date.to_datetime() is None + + +def test_date_label(): + nlp = spacy.blank("eds") + nlp.add_pipe("eds.dates", config={"use_date_label": True, "as_ents": True}) + assert nlp("Le 31/06/17, la dernière dose.").ents[0].label_ == "date"