diff --git a/changelog.md b/changelog.md index 064ea3874..11df722ed 100644 --- a/changelog.md +++ b/changelog.md @@ -2,6 +2,10 @@ ## Unreleased +### Added + +- New `to_duration` method to convert an absolute date into a duration relative to the `note_datetime` (or None if no `note_datetime` is given) + ### Changes - Score / disorders / behaviors entities now have a hardcoded label, instead of being dynamically set from the component name. The following scores may have a different name than the current one in your pipelines: @@ -18,6 +22,7 @@ - the "relative" / "absolute" / "duration" mode of the time entity is now stored in the `mode` attribute of the `span._.date/duration` - the "from" / "until" period bound, if any, is now stored in the `span._.date.bound` attribute +- `to_datetime` now only returns absolute dates: relative dates are converted into absolute ones if `doc._.note_datetime` is given, and None is returned otherwise ### Fixed - `export_to_brat` issue with spans of entities on multiple lines. diff --git a/docs/pipelines/misc/dates.md b/docs/pipelines/misc/dates.md index 09555a1e0..aeb39f9c2 100644 --- a/docs/pipelines/misc/dates.md +++ b/docs/pipelines/misc/dates.md @@ -35,27 +35,30 @@ doc = nlp(text) dates = doc.spans["dates"] dates -# Out: [23 août 2021, il y a un an, pendant une semaine, mai 1995] +# Out: [23 août 2021, il y a un an, mai 1995] dates[0]._.date.to_datetime() # Out: 2021-08-23T00:00:00+02:00 dates[1]._.date.to_datetime() -# Out: -1 year +# Out: None note_datetime = pendulum.datetime(2021, 8, 27, tz="Europe/Paris") dates[1]._.date.to_datetime(note_datetime=note_datetime) -# Out: DateTime(2020, 8, 27, 0, 0, 0, tzinfo=Timezone('Europe/Paris')) +# Out: 2020-08-27T00:00:00+02:00 -date_3_output = dates[3]._.date.to_datetime( +date_2_output = dates[2]._.date.to_datetime( note_datetime=note_datetime, infer_from_context=True, tz="Europe/Paris", default_day=15, ) -date_3_output -# Out: DateTime(1995, 5, 15, 0, 0, 0, tzinfo=Timezone('Europe/Paris')) +date_2_output +# Out: 1995-05-15T00:00:00+02:00 + 
+doc.spans["durations"] +# Out: [pendant une semaine] ``` ## Declared extensions @@ -66,17 +69,9 @@ The `eds.dates` pipeline declares one [spaCy extension](https://spacy.io/usage/p The pipeline can be configured using the following parameters : -| Parameter | Explanation | Default | -|------------------|--------------------------------------------------|-----------------------------------| -| `absolute` | Absolute date patterns, eg `le 5 août 2020` | `None` (use pre-defined patterns) | -| `relative` | Relative date patterns, eg `hier`) | `None` (use pre-defined patterns) | -| `durations` | Duration patterns, eg `pendant trois mois`) | `None` (use pre-defined patterns) | -| `false_positive` | Some false positive patterns to exclude | `None` (use pre-defined patterns) | -| `detect_periods` | Whether to look for periods | `False` | -| `detect_time` | Whether to look for time around dates | `True` | -| `on_ents_only` | Whether to look for dates around entities only | `False` | -| `as_ents` | Whether to save detected dates as entities | `False` | -| `attr` | spaCy attribute to match on, eg `NORM` or `TEXT` | `"NORM"` | +::: edsnlp.pipelines.misc.dates.factory.create_component + options: + only_parameters: true ## Authors and citation diff --git a/docs/pipelines/qualifiers/history.md b/docs/pipelines/qualifiers/history.md index 00cce32e7..1728b97ab 100644 --- a/docs/pipelines/qualifiers/history.md +++ b/docs/pipelines/qualifiers/history.md @@ -80,18 +80,9 @@ doc.ents[3]._.history # (2) The pipeline can be configured using the following parameters : -| Parameter | Explanation | Default | -| -------------------- | -------------------------------------------------------------------------------------------------------------------- | --------------------------------- | -| `attr` | spaCy attribute to match on (eg `NORM`, `TEXT`, `LOWER`) | `"NORM"` | -| `history` | History patterns | `None` (use pre-defined patterns) | -| `termination` | Termination patterns (for 
syntagma/proposition extraction) | `None` (use pre-defined patterns) | -| `use_sections` | Whether to use pre-annotated sections (requires the `sections` pipeline) | `False` | -| `use_dates` | Whether to use dates pipeline (requires the `dates` pipeline and ``note_datetime`` context is recommended) | `False` | -| `history_limit` | If `use_dates = True`. The number of days after which the event is considered as history. | `14` (2 weeks) | -| `exclude_birthdate` | If `use_dates = True`. Whether to exclude the birth date from history dates. | `True` | -| `closest_dates_only` | If `use_dates = True`. Whether to include the closest dates only. If `False`, it includes all dates in the sentence. | `True` | -| `on_ents_only` | Whether to qualify pre-extracted entities only | `True` | -| `explain` | Whether to keep track of the cues for each entity | `False` | +::: edsnlp.pipelines.qualifiers.history.factory.create_component + options: + only_parameters: true ## Declared extensions diff --git a/edsnlp/pipelines/misc/consultation_dates/consultation_dates.py b/edsnlp/pipelines/misc/consultation_dates/consultation_dates.py index d3e65028d..96d58c4a5 100644 --- a/edsnlp/pipelines/misc/consultation_dates/consultation_dates.py +++ b/edsnlp/pipelines/misc/consultation_dates/consultation_dates.py @@ -49,6 +49,7 @@ def __init__( town_mention: Union[List[str], bool], document_date_mention: Union[List[str], bool], attr: str, + name: str = "eds.consultation_dates", **kwargs, ): diff --git a/edsnlp/pipelines/misc/consultation_dates/factory.py b/edsnlp/pipelines/misc/consultation_dates/factory.py index 53d334139..a2dca2f01 100644 --- a/edsnlp/pipelines/misc/consultation_dates/factory.py +++ b/edsnlp/pipelines/misc/consultation_dates/factory.py @@ -34,6 +34,7 @@ def create_component( ): return ConsultationDates( nlp, + name=name, attr=attr, consultation_mention=consultation_mention, document_date_mention=document_date_mention, diff --git a/edsnlp/pipelines/misc/dates/dates.py 
b/edsnlp/pipelines/misc/dates/dates.py index 1c1ebaf2e..93f751d90 100644 --- a/edsnlp/pipelines/misc/dates/dates.py +++ b/edsnlp/pipelines/misc/dates/dates.py @@ -49,6 +49,8 @@ class Dates(BaseComponent): each entity in `#!python doc.spans[key]` detect_periods : bool Whether to detect periods (experimental) + detect_time: bool + Whether to detect time inside dates as_ents : bool Whether to treat dates as entities attr : str @@ -68,8 +70,10 @@ def __init__( detect_time: bool, as_ents: bool, attr: str, + name: str = "eds.dates", ): self.nlp = nlp + self.name = name if absolute is None: if detect_time: @@ -170,7 +174,7 @@ def parse( self, matches: List[Tuple[Span, Dict[str, str]]] ) -> Tuple[List[Span], List[Span]]: """ - Parse dates using the groupdict returned by the matcher. + Parse dates/durations using the groupdict returned by the matcher. Parameters ---------- @@ -184,29 +188,21 @@ def parse( List of processed spans, with the date parsed. """ - dates = [] - durations = [] for span, groupdict in matches: if span.label_ == "relative": parsed = RelativeDate.parse_obj(groupdict) span.label_ = "date" span._.date = parsed - dates.append(span) - print("SPAN", span, parsed.dict()) elif span.label_ == "absolute": parsed = AbsoluteDate.parse_obj(groupdict) span.label_ = "date" span._.date = parsed - dates.append(span) - print("SPAN", span, parsed.dict()) else: parsed = Duration.parse_obj(groupdict) span.label_ = "duration" span._.duration = parsed - durations.append(span) - print("SPAN", span, parsed.dict()) - return dates, durations + return [span for span, _ in matches] def process_periods(self, dates: List[Span]) -> List[Span]: """ @@ -283,17 +279,17 @@ def __call__(self, doc: Doc) -> Doc: spaCy Doc object, annotated for dates """ matches = self.process(doc) - dates, durations = self.parse(matches) + matches = self.parse(matches) - doc.spans["dates"] = dates - doc.spans["durations"] = durations + doc.spans["dates"] = [d for d in matches if d.label_ != "duration"] + 
doc.spans["durations"] = [d for d in matches if d.label_ == "duration"] if self.detect_periods: - doc.spans["periods"] = self.process_periods(dates + durations) + doc.spans["periods"] = self.process_periods(matches) if self.as_ents: ents, discarded = filter_spans( - list(doc.ents) + dates + durations, return_discarded=True + list(doc.ents) + matches, return_discarded=True ) doc.ents = ents diff --git a/edsnlp/pipelines/misc/dates/factory.py b/edsnlp/pipelines/misc/dates/factory.py index ec2f71022..9531bc5b8 100644 --- a/edsnlp/pipelines/misc/dates/factory.py +++ b/edsnlp/pipelines/misc/dates/factory.py @@ -25,19 +25,58 @@ @Language.factory("eds.dates", default_config=DEFAULT_CONFIG, assigns=["doc.spans"]) def create_component( nlp: Language, - name: str, - absolute: Optional[List[str]], - relative: Optional[List[str]], - duration: Optional[List[str]], - false_positive: Optional[List[str]], - on_ents_only: Union[bool, str, List[str], Set[str]], - detect_periods: bool, - detect_time: bool, - as_ents: bool, - attr: str, + name: str = "eds.dates", + absolute: Optional[List[str]] = None, + relative: Optional[List[str]] = None, + duration: Optional[List[str]] = None, + false_positive: Optional[List[str]] = None, + on_ents_only: Union[bool, str, List[str], Set[str]] = False, + detect_periods: bool = False, + detect_time: bool = True, + as_ents: bool = False, + attr: str = "LOWER", ): + """ + Tags and normalizes dates, using the open-source `dateparser` library. + + The pipeline uses spaCy's `filter_spans` function. + It filters out false positives, and introduce a hierarchy between patterns. + For instance, in case of ambiguity, the pipeline will decide that a date is a + date without a year rather than a date without a day. + + Parameters + ---------- + nlp : spacy.language.Language + Language pipeline object + absolute : Union[List[str], str] + List of regular expressions for absolute dates. 
+ relative : Union[List[str], str] + List of regular expressions for relative dates + (eg `hier`, `la semaine prochaine`). + duration : Union[List[str], str] + List of regular expressions for durations + (eg `pendant trois mois`). + false_positive : Union[List[str], str] + List of regular expressions for false positive (eg phone numbers, etc). + on_ents_only : Union[bool, str, List[str]] + Whether to look on dates in the whole document or in specific sentences: + + - If `True`: Only look in the sentences of each entity in doc.ents + - If False: Look in the whole document + - If given a string `key` or list of string: Only look in the sentences of + each entity in `#!python doc.spans[key]` + detect_periods : bool + Whether to detect periods (experimental) + detect_time: bool + Whether to detect time inside dates + as_ents : bool + Whether to treat dates as entities + attr : str + spaCy attribute to use + """ return Dates( nlp, + name=name, absolute=absolute, relative=relative, duration=duration, diff --git a/edsnlp/pipelines/misc/dates/models.py b/edsnlp/pipelines/misc/dates/models.py index 46037dd83..845803e1b 100644 --- a/edsnlp/pipelines/misc/dates/models.py +++ b/edsnlp/pipelines/misc/dates/models.py @@ -1,4 +1,4 @@ -from datetime import datetime +import datetime from enum import Enum from typing import Dict, Optional, Union @@ -73,12 +73,43 @@ class AbsoluteDate(BaseDate): def to_datetime( self, tz: Union[str, pendulum.tz.timezone] = "Europe/Paris", - note_datetime: Optional[datetime] = None, + note_datetime: Optional[Union[pendulum.datetime, datetime.datetime]] = None, infer_from_context: bool = False, default_day=1, default_month=1, **kwargs, ) -> Optional[pendulum.datetime]: + """ + Convert the date to a pendulum.datetime object. + + Parameters + ---------- + tz : Optional[Union[str, pendulum.tz.timezone]] + The timezone to use. Defaults to "Europe/Paris". + note_datetime : Optional[Union[pendulum.datetime, datetime.datetime]] + The datetime of the note. 
Used to infer missing parts of the date. + infer_from_context : bool + Whether to infer missing parts of the date from the note datetime. + In a (year, month, day) triplet: + + - if only year is missing, it will be inferred from the note datetime + - if only month is missing, it will be inferred from the note datetime + - if only day is missing, it will be set to `default_day` + - if only the year is given, the day and month will be set to + `default_day` and `default_month` + - if only the month is given, the day will be set to `default_day` + and the year will be inferred from the note datetime + - if only the day is given, the month and year will be inferred from + the note datetime + default_day : int + Default day to use when inferring missing parts of the date. + default_month : int + Default month to use when inferring missing parts of the date. + + Returns + ------- + Optional[pendulum.datetime] + """ d = self.dict(exclude_none=True) d.pop("mode", None) @@ -90,42 +121,46 @@ def to_datetime( return None elif infer_from_context: - # no year - if ( - not self.year - and self.month - and self.day - and note_datetime - and not isinstance(note_datetime, NaTType) - ): - d["year"] = note_datetime.year - return pendulum.datetime(**d, tz=tz) + if note_datetime and not isinstance(note_datetime, NaTType): + note_datetime = pendulum.instance(note_datetime) + + if self.year is None: + d["year"] = note_datetime.year + if self.month is None: + if self.day is None: + d["month"] = default_month + else: + d["month"] = note_datetime.month + if self.day is None: + d["day"] = default_day + else: + if self.year is None: + return None + if self.month is None: + d["month"] = default_month + if self.day is None: + d["day"] = default_day - # no day - elif self.year and self.month and not self.day: - d["day"] = default_day + try: return pendulum.datetime(**d, tz=tz) + except ValueError: + return None - # year only - elif self.year and not self.month and not self.day: - d["day"] = 
default_day - d["month"] = default_month - return pendulum.datetime(**d, tz=tz) + return None - # month only - elif ( - not self.year - and self.month - and not self.day - and note_datetime - and not isinstance(note_datetime, NaTType) - ): - d["day"] = default_day - d["year"] = note_datetime.year - return pendulum.datetime(**d, tz=tz) - return None + def to_duration( + self, + note_datetime: Optional[Union[pendulum.datetime, datetime.datetime]] = None, + **kwargs, + ) -> Optional[pendulum.Duration]: - return None + if note_datetime and not isinstance(note_datetime, NaTType): + note_datetime = pendulum.instance(note_datetime) + dt = self.to_datetime(note_datetime=note_datetime, **kwargs) + delta = dt.diff(note_datetime) + return delta.as_interval() + else: + return None def norm(self) -> str: @@ -191,31 +226,51 @@ def parse_unit(cls, d: Dict[str, str]) -> Dict[str, str]: return d - -class RelativeDate(Relative): - direction: Direction = Direction.CURRENT - - def to_datetime( - self, - note_datetime: Optional[datetime] = None, - **kwargs, - ) -> pendulum.Duration: + def to_duration(self, **kwargs) -> pendulum.Duration: d = self.dict(exclude_none=True) direction = d.pop("direction", None) dir = -1 if direction == Direction.PAST else 1 d.pop("mode", None) - d.pop("bound", None) d = {f"{k}s": v for k, v in d.items()} td = dir * pendulum.duration(**d) + return td + + def to_datetime(self, **kwargs) -> Optional[pendulum.datetime]: + # for compatibility + return None + + +class RelativeDate(Relative): + direction: Direction = Direction.CURRENT + + def to_datetime( + self, + note_datetime: Optional[Union[pendulum.datetime, datetime.datetime]] = None, + **kwargs, + ) -> Optional[pendulum.datetime]: if note_datetime is not None and not isinstance(note_datetime, NaTType): + note_datetime = pendulum.instance(note_datetime) + + d = self.dict(exclude_none=True) + + direction = d.pop("direction", None) + dir = -1 if direction == Direction.PAST else 1 + + d.pop("mode", None) + 
d.pop("bound", None) + + d = {f"{k}s": v for k, v in d.items()} + + td = dir * pendulum.duration(**d) + return note_datetime + td - return td + return None def norm(self) -> str: @@ -228,7 +283,7 @@ def norm(self) -> str: norm = f"~0 {key}" else: - td = self.to_datetime() + td = self.to_duration() norm = str(td) if td.in_seconds() > 0: norm = f"+{norm}" @@ -266,10 +321,10 @@ class Duration(Relative): def norm(self) -> str: - td = self.to_datetime() + td = self.to_duration() return f"during {td}" - def to_datetime(self, **kwargs) -> pendulum.Duration: + def to_duration(self, **kwargs) -> pendulum.Duration: d = self.dict(exclude_none=True) d = {f"{k}s": v for k, v in d.items() if k not in ("mode", "bound")} diff --git a/edsnlp/pipelines/qualifiers/history/factory.py b/edsnlp/pipelines/qualifiers/history/factory.py index 28b2641dd..1b819aab9 100644 --- a/edsnlp/pipelines/qualifiers/history/factory.py +++ b/edsnlp/pipelines/qualifiers/history/factory.py @@ -3,13 +3,13 @@ from spacy.language import Language from edsnlp.pipelines.qualifiers.history import History, patterns -from edsnlp.pipelines.terminations import termination +from edsnlp.pipelines.terminations import termination as termination_patterns from edsnlp.utils.deprecation import deprecated_factory DEFAULT_CONFIG = dict( attr="NORM", history=patterns.history, - termination=termination, + termination=termination_patterns, use_sections=False, use_dates=False, history_limit=14, @@ -45,18 +45,54 @@ ) def create_component( nlp: Language, - name: str, - history: Optional[List[str]], - termination: Optional[List[str]], - use_sections: bool, - use_dates: bool, - history_limit: int, - exclude_birthdate: bool, - closest_dates_only: bool, - attr: str, - explain: bool, - on_ents_only: Union[bool, str, List[str], Set[str]], + name: str = "eds.history", + history: Optional[List[str]] = patterns.history, + termination: Optional[List[str]] = termination_patterns, + use_sections: bool = False, + use_dates: bool = False, + 
history_limit: int = 14, + exclude_birthdate: bool = True, + closest_dates_only: bool = True, + attr: str = "NORM", + explain: bool = False, + on_ents_only: Union[bool, str, List[str], Set[str]] = True, ): + """ + Implements a history detection algorithm. + + The component looks for terms indicating history in the text. + + Parameters + ---------- + nlp : Language + spaCy nlp pipeline to use for matching. + name : str + Name of the component. + history : Optional[List[str]] + List of terms indicating medical history reference. + termination : Optional[List[str]] + List of syntagms termination terms. + use_sections : bool + Whether to use section pipeline to detect medical history section. + use_dates : bool + Whether to use dates pipeline to detect if the event occurs + a long time before the document date. + history_limit : int + The number of days after which the event is considered as history. + exclude_birthdate : bool + Whether to exclude the birthdate from history dates. + closest_dates_only : bool + Whether to include the closest dates only. + attr : str + spaCy's attribute to use: + a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr' + we can also add a key for each regex. + on_ents_only : bool + Whether to look for matches around detected entities only. + Useful for faster inference in downstream tasks. + explain : bool + Whether to keep track of cues for each entity. + """ return History( nlp, attr=attr, diff --git a/edsnlp/pipelines/qualifiers/history/history.py b/edsnlp/pipelines/qualifiers/history/history.py index d1dda0ccb..7a1450e39 100644 --- a/edsnlp/pipelines/qualifiers/history/history.py +++ b/edsnlp/pipelines/qualifiers/history/history.py @@ -27,6 +27,8 @@ class History(Qualifier): spaCy nlp pipeline to use for matching. history : Optional[List[str]] List of terms indicating medical history reference. + termination : Optional[List[str]] + List of syntagms termination terms. 
use_sections : bool Whether to use section pipeline to detect medical history section. use_dates : bool @@ -35,7 +37,7 @@ class History(Qualifier): history_limit : int The number of days after which the event is considered as history. exclude_birthdate : bool - Whether to exclude the birth date from history dates. + Whether to exclude the birthdate from history dates. closest_dates_only : bool Whether to include the closest dates only. attr : str @@ -280,7 +282,7 @@ def process(self, doc: Doc) -> Doc: for date in doc.spans["dates"]: value = date._.date if value.mode == "relative": - if value.direction.value == "CURRENT": + if value.direction.value == "current": if ( (value.year == 0 and self.history_limit >= timedelta(365)) or ( @@ -293,7 +295,15 @@ def process(self, doc: Doc) -> Doc: Span(doc, date.start, date.end, label="relative_date") ) elif value.direction.value == "past": - if -value.to_datetime() >= self.history_limit: + if ( + -value.to_duration( + note_datetime=doc._.note_datetime, + infer_from_context=True, + tz="Europe/Paris", + default_day=15, + ) + >= self.history_limit + ): history_dates.append( Span(doc, date.start, date.end, label="relative_date") ) diff --git a/tests/pipelines/misc/test_dates.py b/tests/pipelines/misc/test_dates.py index 5485404e1..eb755d5c9 100644 --- a/tests/pipelines/misc/test_dates.py +++ b/tests/pipelines/misc/test_dates.py @@ -6,7 +6,7 @@ from pytest import fixture from spacy.language import Language -from edsnlp.pipelines.misc.dates.models import AbsoluteDate +from edsnlp.pipelines.misc.dates.models import AbsoluteDate, Relative from edsnlp.utils.examples import parse_example TZ = pytz.timezone("Europe/Paris") @@ -88,7 +88,7 @@ def test_dates_component(blank_nlp: Language): assert len(spans) == len(entities) assert len(doc.ents) == len(entities) - for span, entity in zip(doc.spans["dates"], entities): + for span, entity in zip(spans, entities): assert span.text == text[entity.start_char : entity.end_char] date = span._.date 
if span.label_ == "date" else span._.duration @@ -145,8 +145,11 @@ def test_dates_component(blank_nlp: Language): note_datetime=note_datetime, infer_from_context=True ) == TZ.localize(datetime(**d)) + elif isinstance(date, Relative): + assert date.to_datetime() is None else: - assert date.to_datetime() is not None + assert date.to_duration() + assert date.to_datetime(note_datetime=note_datetime) def test_periods(blank_nlp: Language): @@ -186,9 +189,11 @@ def test_time(with_time: bool): doc = nlp(text) - assert len(doc.spans["dates"]) == len(entities) + spans = sorted(doc.spans["dates"] + doc.spans["durations"]) - for span, entity in zip(doc.spans["dates"], entities): + assert len(spans) == len(entities) + + for span, entity in zip(spans, entities): assert span.text == text[entity.start_char : entity.end_char] norm = next(m.value for m in entity.modifiers if m.key == "norm") assert span._.date.norm() == norm @@ -216,7 +221,7 @@ def test_false_positives(blank_nlp: Language): for example in counter_examples: doc = blank_nlp(example) - assert len(doc.spans["dates"]) == 0 + assert len((*doc.spans["dates"], *doc.spans["durations"])) == 0 def test_dates_on_ents_only(): @@ -239,9 +244,11 @@ def test_dates_on_ents_only(): assert len(doc.ents) == 1 - assert len(doc.spans["dates"]) == len(entities) + spans = sorted(doc.spans["dates"] + doc.spans["durations"]) + + assert len(spans) == len(entities) - for span, entity in zip(doc.spans["dates"], entities): + for span, entity in zip(spans, entities): assert span.text == text[entity.start_char : entity.end_char] @@ -252,5 +259,5 @@ def test_illegal_dates(blank_nlp): ) for text in texts: doc = blank_nlp(text) - ent = doc.spans["dates"][0] + ent = sorted((*doc.spans["dates"], *doc.spans["durations"]))[0] assert ent._.date.to_datetime() is None