diff --git a/changelog.md b/changelog.md index b0e85e39c..c4ba0b6ac 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,17 @@ # Changelog +## Unreleased + +### Added + +- New `to_duration` method to convert an absolute date into a date relative to the note_datetime (or None) +- New `use_date_label` in `eds.dates` to store absolute and relative dates under the same `date` label (instead of `absolute` and `relative`) + +### Changed + +- Duration time entities (from `eds.dates`) are now stored in the `durations` span group, separate from the `dates` span group +- `to_datetime` now only returns absolute dates, converts relative dates into absolute ones if `doc._.note_datetime` is given, and returns None otherwise + ## v0.8.0 (2023-03-09) ### Added diff --git a/docs/pipelines/misc/dates.md b/docs/pipelines/misc/dates.md index 09555a1e0..aeb39f9c2 100644 --- a/docs/pipelines/misc/dates.md +++ b/docs/pipelines/misc/dates.md @@ -35,27 +35,30 @@ doc = nlp(text) dates = doc.spans["dates"] dates -# Out: [23 août 2021, il y a un an, pendant une semaine, mai 1995] +# Out: [23 août 2021, il y a un an, mai 1995] dates[0]._.date.to_datetime() # Out: 2021-08-23T00:00:00+02:00 dates[1]._.date.to_datetime() -# Out: -1 year +# Out: None note_datetime = pendulum.datetime(2021, 8, 27, tz="Europe/Paris") dates[1]._.date.to_datetime(note_datetime=note_datetime) -# Out: DateTime(2020, 8, 27, 0, 0, 0, tzinfo=Timezone('Europe/Paris')) +# Out: 2020-08-27T00:00:00+02:00 -date_3_output = dates[3]._.date.to_datetime( +date_2_output = dates[2]._.date.to_datetime( note_datetime=note_datetime, infer_from_context=True, tz="Europe/Paris", default_day=15, ) -date_3_output -# Out: DateTime(1995, 5, 15, 0, 0, 0, tzinfo=Timezone('Europe/Paris')) +date_2_output +# Out: 1995-05-15T00:00:00+02:00 + +doc.spans["durations"] +# Out: [pendant une semaine] ``` ## Declared extensions @@ -66,17 +69,9 @@ The `eds.dates` pipeline declares one [spaCy extension](https://spacy.io/usage/p The pipeline can be configured using the 
following parameters : -| Parameter | Explanation | Default | -|------------------|--------------------------------------------------|-----------------------------------| -| `absolute` | Absolute date patterns, eg `le 5 août 2020` | `None` (use pre-defined patterns) | -| `relative` | Relative date patterns, eg `hier`) | `None` (use pre-defined patterns) | -| `durations` | Duration patterns, eg `pendant trois mois`) | `None` (use pre-defined patterns) | -| `false_positive` | Some false positive patterns to exclude | `None` (use pre-defined patterns) | -| `detect_periods` | Whether to look for periods | `False` | -| `detect_time` | Whether to look for time around dates | `True` | -| `on_ents_only` | Whether to look for dates around entities only | `False` | -| `as_ents` | Whether to save detected dates as entities | `False` | -| `attr` | spaCy attribute to match on, eg `NORM` or `TEXT` | `"NORM"` | +::: edsnlp.pipelines.misc.dates.factory.create_component + options: + only_parameters: true ## Authors and citation diff --git a/docs/pipelines/qualifiers/history.md b/docs/pipelines/qualifiers/history.md index 00cce32e7..1728b97ab 100644 --- a/docs/pipelines/qualifiers/history.md +++ b/docs/pipelines/qualifiers/history.md @@ -80,18 +80,9 @@ doc.ents[3]._.history # (2) The pipeline can be configured using the following parameters : -| Parameter | Explanation | Default | -| -------------------- | -------------------------------------------------------------------------------------------------------------------- | --------------------------------- | -| `attr` | spaCy attribute to match on (eg `NORM`, `TEXT`, `LOWER`) | `"NORM"` | -| `history` | History patterns | `None` (use pre-defined patterns) | -| `termination` | Termination patterns (for syntagma/proposition extraction) | `None` (use pre-defined patterns) | -| `use_sections` | Whether to use pre-annotated sections (requires the `sections` pipeline) | `False` | -| `use_dates` | Whether to use dates pipeline (requires 
the `dates` pipeline and ``note_datetime`` context is recommended) | `False` | -| `history_limit` | If `use_dates = True`. The number of days after which the event is considered as history. | `14` (2 weeks) | -| `exclude_birthdate` | If `use_dates = True`. Whether to exclude the birth date from history dates. | `True` | -| `closest_dates_only` | If `use_dates = True`. Whether to include the closest dates only. If `False`, it includes all dates in the sentence. | `True` | -| `on_ents_only` | Whether to qualify pre-extracted entities only | `True` | -| `explain` | Whether to keep track of the cues for each entity | `False` | +::: edsnlp.pipelines.qualifiers.history.factory.create_component + options: + only_parameters: true ## Declared extensions diff --git a/edsnlp/pipelines/misc/dates/dates.py b/edsnlp/pipelines/misc/dates/dates.py index d6329e45a..ef8768231 100644 --- a/edsnlp/pipelines/misc/dates/dates.py +++ b/edsnlp/pipelines/misc/dates/dates.py @@ -41,7 +41,7 @@ class Dates(BaseComponent): false_positive : Union[List[str], str] List of regular expressions for false positive (eg phone numbers, etc). 
on_ents_only : Union[bool, str, List[str]] - Wether to look on dates in the whole document or in specific sentences: + Whether to look for dates in the whole document or in specific sentences: - If `True`: Only look in the sentences of each entity in doc.ents - If False: Look in the whole document @@ -49,10 +49,15 @@ class Dates(BaseComponent): each entity in `#!python doc.spans[key]` detect_periods : bool Whether to detect periods (experimental) + detect_time : bool + Whether to detect time inside dates as_ents : bool Whether to treat dates as entities attr : str spaCy attribute to use + use_date_label : bool + Whether to use a shared `date` label for absolute and relative dates + instead of `absolute` and `relative` labels """ # noinspection PyProtectedMember @@ -68,8 +73,10 @@ def __init__( detect_time: bool, as_ents: bool, attr: str, + use_date_label: bool = False, ): + self.use_date_label = use_date_label self.nlp = nlp if absolute is None: @@ -193,8 +200,12 @@ def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]: for span, groupdict in dates: if span.label_ == "relative": parsed = RelativeDate.parse_obj(groupdict) + if self.use_date_label: + span.label_ = "date" elif span.label_ == "absolute": parsed = AbsoluteDate.parse_obj(groupdict) + if self.use_date_label: + span.label_ = "date" else: parsed = Duration.parse_obj(groupdict) @@ -275,7 +286,8 @@ def __call__(self, doc: Doc) -> Doc: dates = self.process(doc) dates = self.parse(dates) - doc.spans["dates"] = dates + doc.spans["dates"] = [d for d in dates if d.label_ != "duration"] + doc.spans["durations"] = [d for d in dates if d.label_ == "duration"] if self.detect_periods: doc.spans["periods"] = self.process_periods(dates) diff --git a/edsnlp/pipelines/misc/dates/factory.py b/edsnlp/pipelines/misc/dates/factory.py index db1953afd..20b84fe7d 100644 --- a/edsnlp/pipelines/misc/dates/factory.py +++ b/edsnlp/pipelines/misc/dates/factory.py @@ -25,17 +25,59 @@ @Language.factory("eds.dates", 
default_config=DEFAULT_CONFIG, assigns=["doc.spans"]) def create_component( nlp: Language, - name: str, - absolute: Optional[List[str]], - relative: Optional[List[str]], - duration: Optional[List[str]], - false_positive: Optional[List[str]], - on_ents_only: Union[bool, List[str]], - detect_periods: bool, - detect_time: bool, - as_ents: bool, - attr: str, + name: str = "eds.dates", + absolute: Optional[List[str]] = None, + relative: Optional[List[str]] = None, + duration: Optional[List[str]] = None, + false_positive: Optional[List[str]] = None, + on_ents_only: Union[bool, List[str]] = False, + detect_periods: bool = False, + detect_time: bool = True, + as_ents: bool = False, + attr: str = "LOWER", + use_date_label: bool = False, ): + """ + Tags and normalizes dates, using the open-source `dateparser` library. + + The pipeline uses spaCy's `filter_spans` function. + It filters out false positives, and introduces a hierarchy between patterns. + For instance, in case of ambiguity, the pipeline will decide that a date is a + date without a year rather than a date without a day. + + Parameters + ---------- + nlp : spacy.language.Language + Language pipeline object + absolute : Union[List[str], str] + List of regular expressions for absolute dates. + relative : Union[List[str], str] + List of regular expressions for relative dates + (eg `hier`, `la semaine prochaine`). + duration : Union[List[str], str] + List of regular expressions for durations + (eg `pendant trois mois`). + false_positive : Union[List[str], str] + List of regular expressions for false positives (eg phone numbers, etc). 
+ on_ents_only : Union[bool, str, List[str]] + Whether to look for dates in the whole document or in specific sentences: + + - If `True`: Only look in the sentences of each entity in doc.ents + - If `False`: Look in the whole document + - If given a string `key` or list of strings: Only look in the sentences of + each entity in `#!python doc.spans[key]` + detect_periods : bool + Whether to detect periods (experimental) + detect_time : bool + Whether to detect time inside dates + as_ents : bool + Whether to treat dates as entities + attr : str + spaCy attribute to use + use_date_label : bool + Whether to use a shared `date` label for absolute and relative dates + instead of `absolute` and `relative` labels + """ return Dates( nlp, absolute=absolute, @@ -47,4 +89,5 @@ def create_component( detect_time=detect_time, as_ents=as_ents, attr=attr, + use_date_label=use_date_label, ) diff --git a/edsnlp/pipelines/misc/dates/models.py b/edsnlp/pipelines/misc/dates/models.py index f676c8060..a3b941190 100644 --- a/edsnlp/pipelines/misc/dates/models.py +++ b/edsnlp/pipelines/misc/dates/models.py @@ -1,4 +1,4 @@ -from datetime import datetime +import datetime from enum import Enum from typing import Dict, Optional, Union @@ -68,12 +68,43 @@ class AbsoluteDate(BaseDate): def to_datetime( self, tz: Union[str, pendulum.tz.timezone] = "Europe/Paris", - note_datetime: Optional[datetime] = None, + note_datetime: Optional[Union[pendulum.datetime, datetime.datetime]] = None, infer_from_context: bool = False, default_day=1, default_month=1, **kwargs, ) -> Optional[pendulum.datetime]: + """ + Convert the date to a pendulum.datetime object. + + Parameters + ---------- + tz : Optional[Union[str, pendulum.tz.timezone]] + The timezone to use. Defaults to "Europe/Paris". + note_datetime : Optional[Union[pendulum.datetime, datetime.datetime]] + The datetime of the note. Used to infer missing parts of the date. 
+ infer_from_context : bool + Whether to infer missing parts of the date from the note datetime. + In a (year, month, day) triplet: + + - if only year is missing, it will be inferred from the note datetime + - if only month is missing, it will be inferred from the note datetime + - if only day is missing, it will be set to `default_day` + - if only the year is given, the day and month will be set to + `default_day` and `default_month` + - if only the month is given, the day will be set to `default_day` + and the year will be inferred from the note datetime + - if only the day is given, the month and year will be inferred from + the note datetime + default_day : int + Default day to use when inferring missing parts of the date. + default_month : int + Default month to use when inferring missing parts of the date. + + Returns + ------- + Optional[pendulum.datetime] + """ d = self.dict(exclude_none=True) d.pop("mode", None) @@ -84,43 +115,48 @@ def to_datetime( return None elif infer_from_context: - # no year - if ( - not self.year - and self.month - and self.day - and note_datetime - and not isinstance(note_datetime, NaTType) - ): - d["year"] = note_datetime.year - return pendulum.datetime(**d, tz=tz) - # no day - elif self.year and self.month and not self.day: - d["day"] = default_day - return pendulum.datetime(**d, tz=tz) + if note_datetime and not isinstance(note_datetime, NaTType): + note_datetime = pendulum.instance(note_datetime) + + if self.year is None: + d["year"] = note_datetime.year + if self.month is None: + if self.day is None: + d["month"] = default_month + else: + d["month"] = note_datetime.month + if self.day is None: + d["day"] = default_day + else: + if self.year is None: + return None + if self.month is None: + d["month"] = default_month + if self.day is None: + d["day"] = default_day - # year only - elif self.year and not self.month and not self.day: - d["day"] = default_day - d["month"] = default_month - return pendulum.datetime(**d, tz=tz) - - # 
month only - elif ( - not self.year - and self.month - and not self.day - and note_datetime - and not isinstance(note_datetime, NaTType) - ): - d["day"] = default_day - d["year"] = note_datetime.year + try: return pendulum.datetime(**d, tz=tz) - return None + except ValueError: + return None return None + def to_duration( + self, + note_datetime: Optional[Union[pendulum.datetime, datetime.datetime]] = None, + **kwargs, + ) -> Optional[pendulum.Duration]: + + if note_datetime and not isinstance(note_datetime, NaTType): + note_datetime = pendulum.instance(note_datetime) + dt = self.to_datetime(note_datetime=note_datetime, **kwargs) + delta = dt.diff(note_datetime) + return delta.as_interval() + else: + return None + def norm(self) -> str: year = str(self.year) if self.year else "????" @@ -184,7 +220,7 @@ def parse_unit(cls, d: Dict[str, str]) -> Dict[str, str]: return d - def to_datetime(self, **kwargs) -> pendulum.Duration: + def to_duration(self, **kwargs) -> pendulum.Duration: d = self.dict(exclude_none=True) direction = d.pop("direction", None) @@ -197,20 +233,29 @@ def to_datetime(self, **kwargs) -> pendulum.Duration: td = dir * pendulum.duration(**d) return td + def to_datetime(self, **kwargs) -> Optional[pendulum.datetime]: + # for compatibility + return None + class RelativeDate(Relative): direction: Direction = Direction.CURRENT def to_datetime( self, - note_datetime: Optional[datetime] = None, + note_datetime: Optional[Union[pendulum.datetime, datetime.datetime]] = None, **kwargs, - ) -> pendulum.Duration: - td = super(RelativeDate, self).to_datetime() + ) -> Optional[pendulum.datetime]: if note_datetime is not None and not isinstance(note_datetime, NaTType): + note_datetime = pendulum.instance(note_datetime) + td = super(RelativeDate, self).to_duration() return note_datetime + td + return None + + def to_duration(self, **kwargs) -> pendulum.Duration: + td = super(RelativeDate, self).to_duration() return td def norm(self) -> str: @@ -224,7 +269,7 @@ def 
norm(self) -> str: norm = f"~0 {key}" else: - td = self.to_datetime() + td = self.to_duration() norm = str(td) if td.in_seconds() > 0: norm = f"+{norm}" @@ -262,5 +307,5 @@ class Duration(Relative): def norm(self) -> str: - td = self.to_datetime() + td = self.to_duration() return f"during {td}" diff --git a/edsnlp/pipelines/qualifiers/history/factory.py b/edsnlp/pipelines/qualifiers/history/factory.py index 2cd18e37c..e3b6a9093 100644 --- a/edsnlp/pipelines/qualifiers/history/factory.py +++ b/edsnlp/pipelines/qualifiers/history/factory.py @@ -3,13 +3,13 @@ from spacy.language import Language from edsnlp.pipelines.qualifiers.history import History, patterns -from edsnlp.pipelines.terminations import termination +from edsnlp.pipelines.terminations import termination as termination_patterns from edsnlp.utils.deprecation import deprecated_factory DEFAULT_CONFIG = dict( attr="NORM", history=patterns.history, - termination=termination, + termination=termination_patterns, use_sections=False, use_dates=False, history_limit=14, @@ -45,18 +45,54 @@ ) def create_component( nlp: Language, - name: str, - history: Optional[List[str]], - termination: Optional[List[str]], - use_sections: bool, - use_dates: bool, - history_limit: int, - exclude_birthdate: bool, - closest_dates_only: bool, - attr: str, - explain: bool, - on_ents_only: bool, + name: str = "eds.history", + history: Optional[List[str]] = patterns.history, + termination: Optional[List[str]] = termination_patterns, + use_sections: bool = False, + use_dates: bool = False, + history_limit: int = 14, + exclude_birthdate: bool = True, + closest_dates_only: bool = True, + attr: str = "NORM", + explain: bool = False, + on_ents_only: bool = True, ): + """ + Implements a history detection algorithm. + + The component looks for terms indicating history in the text. + + Parameters + ---------- + nlp : Language + spaCy nlp pipeline to use for matching. + name : str + Name of the component. 
+ history : Optional[List[str]] + List of terms indicating medical history reference. + termination : Optional[List[str]] + List of syntagms termination terms. + use_sections : bool + Whether to use section pipeline to detect medical history section. + use_dates : bool + Whether to use dates pipeline to detect if the event occurs + a long time before the document date. + history_limit : int + The number of days after which the event is considered as history. + exclude_birthdate : bool + Whether to exclude the birthdate from history dates. + closest_dates_only : bool + Whether to include the closest dates only. + attr : str + spaCy's attribute to use: + a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr' + we can also add a key for each regex. + on_ents_only : bool + Whether to look for matches around detected entities only. + Useful for faster inference in downstream tasks. + explain : bool + Whether to keep track of cues for each entity. + """ return History( nlp, attr=attr, diff --git a/edsnlp/pipelines/qualifiers/history/history.py b/edsnlp/pipelines/qualifiers/history/history.py index 6c0fa93c9..a6aa487bc 100644 --- a/edsnlp/pipelines/qualifiers/history/history.py +++ b/edsnlp/pipelines/qualifiers/history/history.py @@ -17,9 +17,9 @@ class History(Qualifier): """ - Implements an history detection algorithm. + Implements a history detection algorithm. - The components looks for terms indicating history in the text. + The component looks for terms indicating history in the text. Parameters ---------- @@ -28,7 +28,7 @@ class History(Qualifier): history : Optional[List[str]] List of terms indicating medical history reference. termination : Optional[List[str]] - List of syntagme termination terms. + List of syntagms termination terms. use_sections : bool Whether to use section pipeline to detect medical history section. 
use_dates : bool @@ -37,7 +37,7 @@ class History(Qualifier): history_limit : int The number of days after which the event is considered as history. exclude_birthdate : bool - Whether to exclude the birth date from history dates. + Whether to exclude the birthdate from history dates. closest_dates_only : bool Whether to include the closest dates only. attr : str @@ -47,8 +47,6 @@ class History(Qualifier): on_ents_only : bool Whether to look for matches around detected entities only. Useful for faster inference in downstream tasks. - regex : Optional[Dict[str, Union[List[str], str]]] - A dictionary of regex patterns. explain : bool Whether to keep track of cues for each entity. """ @@ -300,7 +298,15 @@ def process(self, doc: Doc) -> Doc: Span(doc, date.start, date.end, label="relative_date") ) elif date._.date.direction.value == "PAST": - if -date._.date.to_datetime() >= self.history_limit: + if ( + -date._.date.to_duration( + note_datetime=doc._.note_datetime, + infer_from_context=True, + tz="Europe/Paris", + default_day=15, + ) + >= self.history_limit + ): history_dates.append( Span(doc, date.start, date.end, label="relative_date") ) diff --git a/tests/pipelines/misc/test_dates.py b/tests/pipelines/misc/test_dates.py index 54f0057a8..5e055fdf9 100644 --- a/tests/pipelines/misc/test_dates.py +++ b/tests/pipelines/misc/test_dates.py @@ -6,7 +6,7 @@ from pytest import fixture from spacy.language import Language -from edsnlp.pipelines.misc.dates.models import AbsoluteDate, Direction, Mode +from edsnlp.pipelines.misc.dates.models import AbsoluteDate, Direction, Mode, Relative from edsnlp.utils.examples import parse_example TZ = pytz.timezone("Europe/Paris") @@ -84,10 +84,12 @@ def test_dates_component(blank_nlp: Language): doc = blank_nlp(text) - assert len(doc.spans["dates"]) == len(entities) + assert len((*doc.spans["dates"], *doc.spans["durations"])) == len(entities) assert len(doc.ents) == len(entities) - for span, entity in zip(doc.spans["dates"], entities): + for 
span, entity in zip( + (*doc.spans["dates"], *doc.spans["durations"]), entities + ): assert span.text == text[entity.start_char : entity.end_char] date = span._.date @@ -183,8 +185,11 @@ def test_dates_component(blank_nlp: Language): note_datetime=note_datetime, infer_from_context=True ) == TZ.localize(datetime(**d)) + elif isinstance(date, Relative): + assert date.to_datetime() is None else: - assert date.to_datetime() is not None + assert date.to_duration() + assert date.to_datetime(note_datetime=note_datetime) def test_periods(blank_nlp: Language): @@ -224,9 +229,11 @@ def test_time(with_time: bool): doc = nlp(text) - assert len(doc.spans["dates"]) == len(entities) + assert len((*doc.spans["dates"], *doc.spans["durations"])) == len(entities) - for span, entity in zip(doc.spans["dates"], entities): + for span, entity in zip( + (*doc.spans["dates"], *doc.spans["durations"]), entities + ): assert span.text == text[entity.start_char : entity.end_char] norm = next(m.value for m in entity.modifiers if m.key == "norm") assert span._.date.norm() == norm @@ -254,7 +261,7 @@ def test_false_positives(blank_nlp: Language): for example in counter_examples: doc = blank_nlp(example) - assert len(doc.spans["dates"]) == 0 + assert len((*doc.spans["dates"], *doc.spans["durations"])) == 0 def test_dates_on_ents_only(): @@ -277,9 +284,9 @@ def test_dates_on_ents_only(): assert len(doc.ents) == 1 - assert len(doc.spans["dates"]) == len(entities) + assert len((*doc.spans["dates"], *doc.spans["durations"])) == len(entities) - for span, entity in zip(doc.spans["dates"], entities): + for span, entity in zip((*doc.spans["dates"], *doc.spans["durations"]), entities): assert span.text == text[entity.start_char : entity.end_char] @@ -290,5 +297,11 @@ def test_illegal_dates(blank_nlp): ) for text in texts: doc = blank_nlp(text) - ent = doc.spans["dates"][0] + ent = (*doc.spans["dates"], *doc.spans["durations"])[0] assert ent._.date.to_datetime() is None + + +def test_date_label(): + nlp = 
spacy.blank("eds") + nlp.add_pipe("eds.dates", config={"use_date_label": True, "as_ents": True}) + assert nlp("Le 31/06/17, la dernière dose.").ents[0].label_ == "date"