Skip to content

Commit

Permalink
refacto: split dates into dates and durations & and add bound attribu…
Browse files Browse the repository at this point in the history
…te for period bounds
  • Loading branch information
percevalw committed Aug 8, 2023
1 parent 4fbf4b4 commit 9dcf057
Show file tree
Hide file tree
Showing 8 changed files with 129 additions and 135 deletions.
6 changes: 6 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
* `eds.elston-ellis` → `elston-ellis`
* `eds.SOFA` → `sofa`
* `eds.adicap` → `adicap`
- `eds.dates` now separates dates from durations. Each entity has its own label:
* `spans["dates"]` → entities labelled as `date` with a `span._.date` parsed object
* `spans["durations"]` → entities labelled as `duration` with a `span._.duration` parsed object
- the "relative" / "absolute" / "duration" mode of the time entity is now stored in
the `mode` attribute of the `span._.date/duration`
- the "from" / "until" period bound, if any, is now stored in the `span._.date.bound` attribute

### Fixed
- `export_to_brat` issue with spans of entities on multiple lines.
Expand Down
77 changes: 49 additions & 28 deletions edsnlp/pipelines/misc/dates/dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from edsnlp.utils.filter import filter_spans

from . import patterns
from .models import AbsoluteDate, Duration, Mode, Period, RelativeDate
from .models import AbsoluteDate, Bound, Duration, Mode, Period, RelativeDate

PERIOD_PROXIMITY_THRESHOLD = 3

Expand Down Expand Up @@ -41,7 +41,7 @@ class Dates(BaseComponent):
false_positive : Union[List[str], str]
List of regular expressions for false positive (eg phone numbers, etc).
on_ents_only : Union[bool, str, Iterable[str]]
Wether to look on dates in the whole document or in specific sentences:
Whether to look on dates in the whole document or in specific sentences:
- If `True`: Only look in the sentences of each entity in doc.ents
- If False: Look in the whole document
Expand Down Expand Up @@ -69,7 +69,6 @@ def __init__(
as_ents: bool,
attr: str,
):

self.nlp = nlp

if absolute is None:
Expand Down Expand Up @@ -122,10 +121,13 @@ def set_extensions(cls) -> None:
if not Span.has_extension("date"):
Span.set_extension("date", default=None)

if not Span.has_extension("duration"):
Span.set_extension("duration", default=None)

if not Span.has_extension("period"):
Span.set_extension("period", default=None)

def process(self, doc: Doc) -> List[Span]:
def process(self, doc: Doc) -> List[Tuple[Span, Dict[str, str]]]:
"""
Find dates in doc.
Expand Down Expand Up @@ -164,33 +166,47 @@ def process(self, doc: Doc) -> List[Span]:

return dates

def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
def parse(
self, matches: List[Tuple[Span, Dict[str, str]]]
) -> Tuple[List[Span], List[Span]]:
"""
Parse dates using the groupdict returned by the matcher.
Parameters
----------
dates : List[Tuple[Span, Dict[str, str]]]
matches : List[Tuple[Span, Dict[str, str]]]
List of tuples containing the spans and groupdict
returned by the matcher.
Returns
-------
List[Span]
Tuple[List[Span], List[Span]]
List of processed spans, with the date parsed.
"""

for span, groupdict in dates:
dates = []
durations = []
for span, groupdict in matches:
if span.label_ == "relative":
parsed = RelativeDate.parse_obj(groupdict)
span.label_ = "date"
span._.date = parsed
dates.append(span)
print("SPAN", span, parsed.dict())
elif span.label_ == "absolute":
parsed = AbsoluteDate.parse_obj(groupdict)
span.label_ = "date"
span._.date = parsed
dates.append(span)
print("SPAN", span, parsed.dict())
else:
parsed = Duration.parse_obj(groupdict)
span.label_ = "duration"
span._.duration = parsed
durations.append(span)
print("SPAN", span, parsed.dict())

span._.date = parsed

return [span for span, _ in dates]
return dates, durations

def process_periods(self, dates: List[Span]) -> List[Span]:
"""
Expand All @@ -216,28 +232,32 @@ def process_periods(self, dates: List[Span]) -> List[Span]:
dates = list(sorted(dates, key=lambda d: d.start))

for d1, d2 in zip(dates[:-1], dates[1:]):

if d1._.date.mode == Mode.DURATION or d2._.date.mode == Mode.DURATION:
v1 = d1._.date if d1.label_ == "date" else d1._.duration
v2 = d2._.date if d2.label_ == "date" else d2._.duration
if v1.mode == Mode.DURATION or v2.mode == Mode.DURATION:
pass
elif d1 in seen or d1._.date.mode is None or d2._.date.mode is None:
elif d1 in seen or v1.bound is None or v2.bound is None:
continue

if (
d1.end - d2.start < PERIOD_PROXIMITY_THRESHOLD
and d1._.date.mode != d2._.date.mode
):

if d1.end - d2.start < PERIOD_PROXIMITY_THRESHOLD and v1.bound != v2.bound:
period = Span(d1.doc, d1.start, d2.end, label="period")

# If one date is a duration,
# the other may not have a registered mode.
m1 = d1._.date.mode or Mode.FROM
m2 = d2._.date.mode or Mode.FROM
# the other may not have a registered bound attribute.
if v1.mode == Mode.DURATION:
m1 = Bound.FROM if v2.bound == Bound.UNTIL else Bound.UNTIL
m2 = v2.mode or Bound.FROM
elif v2.mode == Mode.DURATION:
m1 = v1.mode or Bound.FROM
m2 = Bound.FROM if v1.bound == Bound.UNTIL else Bound.UNTIL
else:
m1 = v1.mode or Bound.FROM
m2 = v2.mode or Bound.FROM

period._.period = Period.parse_obj(
{
m1.value: d1,
m2.value: d2,
m1: d1,
m2: d2,
}
)

Expand All @@ -262,17 +282,18 @@ def __call__(self, doc: Doc) -> Doc:
doc : Doc
spaCy Doc object, annotated for dates
"""
dates = self.process(doc)
dates = self.parse(dates)
matches = self.process(doc)
dates, durations = self.parse(matches)

doc.spans["dates"] = dates
doc.spans["durations"] = durations

if self.detect_periods:
doc.spans["periods"] = self.process_periods(dates)
doc.spans["periods"] = self.process_periods(dates + durations)

if self.as_ents:
ents, discarded = filter_spans(
list(doc.ents) + dates, return_discarded=True
list(doc.ents) + dates + durations, return_discarded=True
)

doc.ents = ents
Expand Down
55 changes: 33 additions & 22 deletions edsnlp/pipelines/misc/dates/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,21 @@
from edsnlp.pipelines.misc.dates.patterns.relative import specific_dict


class Direction(Enum):
class Direction(str, Enum):
FUTURE = "future"
PAST = "past"
CURRENT = "current"

FUTURE = "FUTURE"
PAST = "PAST"
CURRENT = "CURRENT"

class Bound(str, Enum):
UNTIL = "until"
FROM = "from"

class Mode(Enum):

FROM = "FROM"
UNTIL = "UNTIL"
DURATION = "DURATION"
class Mode(str, Enum):
ABSOLUTE = "absolute"
RELATIVE = "relative"
DURATION = "duration"


class Period(BaseModel):
Expand All @@ -35,7 +38,8 @@ class Config:

class BaseDate(BaseModel):

mode: Optional[Mode] = None
mode: Mode = None
bound: Optional[Bound] = None

@validator("*", pre=True)
def remove_space(cls, v):
Expand All @@ -58,6 +62,7 @@ def validate_strings(cls, d: Dict[str, str]) -> Dict[str, str]:

class AbsoluteDate(BaseDate):

mode: Mode = Mode.ABSOLUTE
year: Optional[int] = None
month: Optional[int] = None
day: Optional[int] = None
Expand All @@ -77,6 +82,7 @@ def to_datetime(

d = self.dict(exclude_none=True)
d.pop("mode", None)
d.pop("bound", None)
if self.year and self.month and self.day:
try:
return pendulum.datetime(**d, tz=tz)
Expand Down Expand Up @@ -151,6 +157,7 @@ def validate_year(cls, v):

class Relative(BaseDate):

mode: Mode = Mode.RELATIVE
year: Optional[int] = None
month: Optional[int] = None
week: Optional[int] = None
Expand Down Expand Up @@ -184,29 +191,26 @@ def parse_unit(cls, d: Dict[str, str]) -> Dict[str, str]:

return d

def to_datetime(self, **kwargs) -> pendulum.Duration:

class RelativeDate(Relative):
direction: Direction = Direction.CURRENT

def to_datetime(
self,
note_datetime: Optional[datetime] = None,
**kwargs,
) -> pendulum.Duration:
d = self.dict(exclude_none=True)

direction = d.pop("direction", None)
dir = -1 if direction == Direction.PAST else 1

d.pop("mode", None)
d.pop("bound", None)

d = {f"{k}s": v for k, v in d.items()}

td = dir * pendulum.duration(**d)
return td


class RelativeDate(Relative):
direction: Direction = Direction.CURRENT

def to_datetime(
self,
note_datetime: Optional[datetime] = None,
**kwargs,
) -> pendulum.Duration:
td = super(RelativeDate, self).to_datetime()

if note_datetime is not None and not isinstance(note_datetime, NaTType):
return note_datetime + td
Expand Down Expand Up @@ -264,3 +268,10 @@ def norm(self) -> str:

td = self.to_datetime()
return f"during {td}"

def to_datetime(self, **kwargs) -> pendulum.Duration:
d = self.dict(exclude_none=True)

d = {f"{k}s": v for k, v in d.items() if k not in ("mode", "bound")}

return pendulum.duration(**d)
8 changes: 4 additions & 4 deletions edsnlp/pipelines/misc/dates/patterns/atomic/directions.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from edsnlp.utils.regex import make_pattern

preceding_directions = [
r"(?P<direction_PAST>depuis|depuis\s+le|il\s+y\s+a|à)",
r"(?P<direction_FUTURE>dans)",
r"(?P<direction_past>depuis|depuis\s+le|il\s+y\s+a|à)",
r"(?P<direction_future>dans)",
]

following_directions = [
r"(?P<direction_FUTURE>prochaine?s?|suivante?s?|plus\s+tard)",
r"(?P<direction_PAST>derni[eè]re?s?|passée?s?|pr[ée]c[ée]dente?s?|plus\s+t[ôo]t)",
r"(?P<direction_future>prochaine?s?|suivante?s?|plus\s+tard)",
r"(?P<direction_past>derni[eè]re?s?|passée?s?|pr[ée]c[ée]dente?s?|plus\s+t[ôo]t)",
]

preceding_direction_pattern = make_pattern(preceding_directions, with_breaks=True)
Expand Down
4 changes: 2 additions & 2 deletions edsnlp/pipelines/misc/dates/patterns/atomic/modes.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from edsnlp.utils.regex import make_pattern

modes = [
r"(?P<mode_FROM>depuis|depuis\s+le|[àa]\s+partir\s+d[eu]|du)",
r"(?P<mode_UNTIL>jusqu'[àa]u?|au)",
r"(?P<bound_from>depuis|depuis\s+le|[àa]\s+partir\s+d[eu]|du)",
r"(?P<bound_until>jusqu'[àa]u?|au)",
]

mode_pattern = make_pattern(modes, with_breaks=True)
8 changes: 4 additions & 4 deletions edsnlp/pipelines/misc/dates/patterns/relative.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ def make_specific_pattern(mode: str = "forward"):


specific = {
"minus1": (r"hier", dict(direction="PAST", day=1)),
"minus2": (r"avant[-\s]hier", dict(direction="PAST", day=2)),
"plus1": (r"demain", dict(direction="FUTURE", day=1)),
"plus2": (r"après[-\s]demain", dict(direction="FUTURE", day=2)),
"minus1": (r"hier", dict(direction="past", day=1)),
"minus2": (r"avant[-\s]hier", dict(direction="past", day=2)),
"plus1": (r"demain", dict(direction="future", day=1)),
"plus2": (r"après[-\s]demain", dict(direction="future", day=2)),
}

specific_pattern = make_pattern(
Expand Down
30 changes: 12 additions & 18 deletions edsnlp/pipelines/qualifiers/history/history.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,38 +278,32 @@ def process(self, doc: Doc) -> Doc:
recent_dates = []
if self.dates:
for date in doc.spans["dates"]:
if date.label_ == "relative":
if date._.date.direction.value == "CURRENT":
value = date._.date
if value.mode == "relative":
if value.direction.value == "CURRENT":
if (
(
date._.date.year == 0
and self.history_limit >= timedelta(365)
)
or (
date._.date.month == 0
and self.history_limit >= timedelta(30)
)
(value.year == 0 and self.history_limit >= timedelta(365))
or (
date._.date.week == 0
and self.history_limit >= timedelta(7)
value.month == 0 and self.history_limit >= timedelta(30)
)
or (date._.date.day == 0)
or (value.week == 0 and self.history_limit >= timedelta(7))
or (value.day == 0)
):
recent_dates.append(
Span(doc, date.start, date.end, label="relative_date")
)
elif date._.date.direction.value == "PAST":
if -date._.date.to_datetime() >= self.history_limit:
elif value.direction.value == "past":
if -value.to_datetime() >= self.history_limit:
history_dates.append(
Span(doc, date.start, date.end, label="relative_date")
)
else:
recent_dates.append(
Span(doc, date.start, date.end, label="relative_date")
)
elif date.label_ == "absolute" and doc._.note_datetime:
elif value.mode == "absolute" and doc._.note_datetime:
try:
absolute_date = date._.date.to_datetime(
absolute_date = value.to_datetime(
note_datetime=note_datetime,
infer_from_context=True,
tz="Europe/Paris",
Expand All @@ -321,7 +315,7 @@ def process(self, doc: Doc) -> Doc:
"In doc {}, the following date {} raises this error: {}. "
"Skipping this date.",
doc._.note_id,
date._.date,
value,
e,
)
if absolute_date:
Expand Down
Loading

0 comments on commit 9dcf057

Please sign in to comment.