Skip to content

Commit

Permalink
feat: in eds.dates new option for shared date label and assign durati…
Browse files Browse the repository at this point in the history
…on to separate span group
  • Loading branch information
percevalw committed Mar 17, 2023
1 parent f1ed69c commit 2544edd
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 9 deletions.
12 changes: 12 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# Changelog

## Unreleased

### Added

- New `to_duration` method to convert an absolute date into a date relative to the note_datetime (or None)
- New `use_date_label` option in `eds.dates` to store absolute and relative dates under a shared `date` label (instead of `absolute` and `relative`)

### Changed

- Duration time entities (from `eds.dates`) are now stored in the `durations` span group, separate from the `dates` span group
- `to_datetime` now only returns absolute dates: it converts relative dates into absolute ones if `doc._.note_datetime` is given, and returns None otherwise

## v0.8.0 (2023-03-09)

### Added
Expand Down
12 changes: 11 additions & 1 deletion edsnlp/pipelines/misc/dates/dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ class Dates(BaseComponent):
Whether to treat dates as entities
attr : str
spaCy attribute to use
use_date_label: bool
Whether to use a shared `date` label for absolute and relative dates
instead of `absolute` and `relative` labels
"""

# noinspection PyProtectedMember
Expand All @@ -70,8 +73,10 @@ def __init__(
detect_time: bool,
as_ents: bool,
attr: str,
use_date_label: bool = False,
):

self.use_date_label = use_date_label
self.nlp = nlp

if absolute is None:
Expand Down Expand Up @@ -195,8 +200,12 @@ def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
for span, groupdict in dates:
if span.label_ == "relative":
parsed = RelativeDate.parse_obj(groupdict)
if self.use_date_label:
span.label_ = "date"
elif span.label_ == "absolute":
parsed = AbsoluteDate.parse_obj(groupdict)
if self.use_date_label:
span.label_ = "date"
else:
parsed = Duration.parse_obj(groupdict)

Expand Down Expand Up @@ -277,7 +286,8 @@ def __call__(self, doc: Doc) -> Doc:
dates = self.process(doc)
dates = self.parse(dates)

doc.spans["dates"] = dates
doc.spans["dates"] = [d for d in dates if d.label_ != "duration"]
doc.spans["durations"] = [d for d in dates if d.label_ == "duration"]

if self.detect_periods:
doc.spans["periods"] = self.process_periods(dates)
Expand Down
5 changes: 5 additions & 0 deletions edsnlp/pipelines/misc/dates/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def create_component(
detect_time: bool = True,
as_ents: bool = False,
attr: str = "LOWER",
use_date_label: bool = False,
):
"""
Tags and normalizes dates, using the open-source `dateparser` library.
Expand Down Expand Up @@ -73,6 +74,9 @@ def create_component(
Whether to treat dates as entities
attr : str
spaCy attribute to use
use_date_label: bool
Whether to use a shared `date` label for absolute and relative dates
instead of `absolute` and `relative` labels
"""
return Dates(
nlp,
Expand All @@ -85,4 +89,5 @@ def create_component(
detect_time=detect_time,
as_ents=as_ents,
attr=attr,
use_date_label=use_date_label,
)
26 changes: 18 additions & 8 deletions tests/pipelines/misc/test_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,12 @@ def test_dates_component(blank_nlp: Language):

doc = blank_nlp(text)

assert len(doc.spans["dates"]) == len(entities)
assert len((*doc.spans["dates"], *doc.spans["durations"])) == len(entities)
assert len(doc.ents) == len(entities)

for span, entity in zip(doc.spans["dates"], entities):
for span, entity in zip(
(*doc.spans["dates"], *doc.spans["durations"]), entities
):
assert span.text == text[entity.start_char : entity.end_char]

date = span._.date
Expand Down Expand Up @@ -227,9 +229,11 @@ def test_time(with_time: bool):

doc = nlp(text)

assert len(doc.spans["dates"]) == len(entities)
assert len((*doc.spans["dates"], *doc.spans["durations"])) == len(entities)

for span, entity in zip(doc.spans["dates"], entities):
for span, entity in zip(
(*doc.spans["dates"], *doc.spans["durations"]), entities
):
assert span.text == text[entity.start_char : entity.end_char]
norm = next(m.value for m in entity.modifiers if m.key == "norm")
assert span._.date.norm() == norm
Expand Down Expand Up @@ -257,7 +261,7 @@ def test_false_positives(blank_nlp: Language):
for example in counter_examples:
doc = blank_nlp(example)

assert len(doc.spans["dates"]) == 0
assert len((*doc.spans["dates"], *doc.spans["durations"])) == 0


def test_dates_on_ents_only():
Expand All @@ -280,9 +284,9 @@ def test_dates_on_ents_only():

assert len(doc.ents) == 1

assert len(doc.spans["dates"]) == len(entities)
assert len((*doc.spans["dates"], *doc.spans["durations"])) == len(entities)

for span, entity in zip(doc.spans["dates"], entities):
for span, entity in zip((*doc.spans["dates"], *doc.spans["durations"]), entities):
assert span.text == text[entity.start_char : entity.end_char]


Expand All @@ -293,5 +297,11 @@ def test_illegal_dates(blank_nlp):
)
for text in texts:
doc = blank_nlp(text)
ent = doc.spans["dates"][0]
ent = (*doc.spans["dates"], *doc.spans["durations"])[0]
assert ent._.date.to_datetime() is None


def test_date_label():
    """Check that `use_date_label` makes the first exported entity carry the `date` label."""
    pipeline = spacy.blank("eds")
    pipeline.add_pipe(
        "eds.dates",
        config={"use_date_label": True, "as_ents": True},
    )

    doc = pipeline("Le 31/06/17, la dernière dose.")
    first_entity = doc.ents[0]

    # `as_ents` exports the detected dates to `doc.ents`; with `use_date_label`
    # the span is expected to be labelled `date` rather than `absolute`.
    assert first_entity.label_ == "date"

0 comments on commit 2544edd

Please sign in to comment.