diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index ca281dbd5..13108923b 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -13,7 +13,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install '.[docs]'
+ pip install '.[dev]'
- name: Set up Git
run: |
git config user.name ${{ github.actor }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 540fd7fc8..6f8dba5fb 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -72,7 +72,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install '.[docs]'
+ pip install '.[dev]'
- name: Set up Git
run: |
git config user.name ${{ github.actor }}
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 01a2133be..dc677ddfe 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -7,22 +7,17 @@ on:
jobs:
Linting:
+ if: github.event_name == 'pull_request'
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
- - uses: actions/setup-python@v2
- - name: Set PY variable
- run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV
- - uses: actions/cache@v2
+ - uses: actions/checkout@v3
with:
- path: ~/.cache/pre-commit
- key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
- - name: Install pre-commit
- run: |
- pip install pre-commit
- pre-commit install
- - name: Run pre-commit
- run: SKIP=no-commit-to-branch pre-commit run --all-files
+          # required to grab the history of the PR
+ fetch-depth: 0
+ - uses: actions/setup-python@v3
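+      # Run pre-commit hooks only on the files changed between the PR base and head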
+ - uses: pre-commit/action@v3.0.0
+ with:
+ extra_args: --color=always --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }}
Pytest:
runs-on: ubuntu-latest
@@ -79,7 +74,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install '.[docs]'
+ pip install '.[dev]'
- name: Build documentation
run: |
mkdocs build --clean
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 08f632d5b..816f9dfe7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,24 +17,15 @@ repos:
- id: check-toml
- id: check-json
- id: check-symlinks
- - id: check-docstring-first
- id: check-added-large-files
- id: detect-private-key
- - repo: https://github.com/pycqa/isort
- rev: 5.11.5
+ # ruff
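+  # (replaces the isort and flake8 hooks removed in this change)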
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
+ # Ruff version.
+ rev: 'v0.0.288'
hooks:
- - id: isort
- name: isort (python)
- args: ["--profile", "black"]
- - id: isort
- name: isort (cython)
- types: [cython]
- args: ["--profile", "black"]
- - id: isort
- name: isort (pyi)
- types: [pyi]
- args: ["--profile", "black"]
-
+ - id: ruff
+ args: ['--config', 'pyproject.toml']
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
@@ -45,10 +36,6 @@ repos:
- id: blacken-docs
additional_dependencies: [black==20.8b1]
exclude: notebooks/
- - repo: https://github.com/pycqa/flake8
- rev: 4.0.1
- hooks:
- - id: flake8
- repo: https://github.com/econchick/interrogate
rev: 1.5.0
hooks:
diff --git a/Makefile b/Makefile
index 6c1b824f0..11944d17c 100644
--- a/Makefile
+++ b/Makefile
@@ -14,14 +14,14 @@ create-env: .venv
install : .venv
. .venv/bin/activate
- pip install -r '.[dev,docs,setup]'.txt
+	pip install '.[dev,setup]'
python scripts/conjugate_verbs.py
pip install -e .
pre-commit install
documentation: .venv
. .venv/bin/activate
- pip install -e '.[docs]'
+ pip install -e '.[dev]'
mkdocs serve
test: .venv
diff --git a/README.md b/README.md
index fdceedae8..0a553df27 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Once you've installed the library, let's begin with a very simple example that e
```python
import spacy
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
terms = dict(
covid=["covid", "coronavirus"],
diff --git a/changelog.md b/changelog.md
index 321608297..cdc59fcb8 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,6 +1,30 @@
# Changelog
-## Pending
+## Unreleased
+
+### Added
+
+- New `to_duration` method to convert an absolute date into a duration relative to the `note_datetime` (or None)
+
+### Changed
+
+- Input and output of components are now specified by `span_getter` and `span_setter` arguments.
+- :boom: Score / disorder / behavior entities now have a fixed label (passed as an argument), instead of being dynamically set from the component name. The following components may therefore emit a different label than the one currently in your pipelines:
+ * `eds.emergency.gemsa` → `emergency_gemsa`
+ * `eds.emergency.ccmu` → `emergency_ccmu`
+ * `eds.emergency.priority` → `emergency_priority`
+ * `eds.charlson` → `charlson`
+ * `eds.elston_ellis` → `elston_ellis`
+ * `eds.SOFA` → `sofa`
+ * `eds.adicap` → `adicap`
+  * `eds.measurements` → `size`, `weight`, ... instead of `eds.size`, `eds.weight`, ...
+- `eds.dates` now separates dates from durations. Each entity has its own label:
+ * `spans["dates"]` → entities labelled as `date` with a `span._.date` parsed object
+ * `spans["durations"]` → entities labelled as `duration` with a `span._.duration` parsed object
+- The "relative" / "absolute" / "duration" mode of the time entity is now stored in
+  the `mode` attribute of `span._.date` / `span._.duration`
+- The "from" / "until" period bound, if any, is now stored in the `span._.date.bound` attribute
+- `to_datetime` now only returns absolute dates; it converts relative dates into absolute ones when `doc._.note_datetime` is set, and returns None otherwise
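+
+A minimal sketch of the new date handling (a hypothetical snippet based on the names above; exact calls may differ):
+
+```{ .python .no-check }
+doc = nlp("Hospitalisé le 12 janvier 2023 pendant trois jours.")
+
+date = doc.spans["dates"][0]  # labelled "date"
+date._.date.mode  # "absolute" here, "relative" for e.g. "il y a 3 jours"
+date._.date.to_datetime()  # relative dates need doc._.note_datetime, else None
+date._.date.to_duration()  # new: duration relative to doc._.note_datetime
+
+duration = doc.spans["durations"][0]  # labelled "duration"
+duration._.duration.mode  # "duration"
+```
+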
### Fixed
- `export_to_brat` issue with spans of entities on multiple lines.
@@ -300,9 +324,7 @@ Fix release to allow installation from source
- Renamed `generic` to `matcher`. This is a non-breaking change for the average user, adding the pipeline is still:
-
-
- ```python
+ ```{ .python .no-check }
nlp.add_pipe("matcher", config=dict(terms=dict(maladie="maladie")))
```
diff --git a/contributing.md b/contributing.md
index 2ca7634f2..aba1d1f0d 100644
--- a/contributing.md
+++ b/contributing.md
@@ -24,7 +24,7 @@ $ python -m venv venv
$ source venv/bin/activate
# Install the package with common, dev, setup dependencies in editable mode
-$ pip install -e '.[dev,docs,setup]'
+$ pip install -e '.[dev,setup]'
# And build resources
$ python scripts/conjugate_verbs.py
```
@@ -113,7 +113,7 @@ We use `MkDocs` for EDS-NLP's documentation. You can checkout the changes you ma
```console
# Install the requirements
-$ pip install -e '.[docs]'
+$ pip install -e '.[dev]'
---> 100%
color:green Installation successful
diff --git a/demo/app.py b/demo/app.py
index 270af89c8..ff7beb9cf 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -4,7 +4,6 @@
import spacy
import streamlit as st
from spacy import displacy
-from spacy.tokens import Span
from edsnlp.utils.filter import filter_spans
@@ -65,22 +64,43 @@
doc.ents
"""
+PIPES = {
+ "Dates": "dates",
+ "Measurements": "measurements",
+ "Charlson": "charlson",
+ "SOFA": "sofa",
+ "Elston & Ellis": "elston_ellis",
+ "TNM": "tnm",
+ "Priority": "emergency_priority",
+ "CCMU": "emergency_ccmu",
+ "GEMSA": "emergency_gemsa",
+ "Covid": "covid",
+ "CIM10": "cim10",
+ "Drugs": "drugs",
+ "Adicap": "adicap",
+ "Diabetes": "diabetes",
+ "Tobacco": "tobacco",
+ "AIDS": "aids",
+ "Lymphoma": "lymphoma",
+ "Leukemia": "leukemia",
+ "Solid Tumor": "solid_tumor",
+ "CKD": "ckd",
+ "Hemiplegia": "hemiplegia",
+ "Liver Disease": "liver_disease",
+ "Peptic Ulcer Disease": "peptic_ulcer_disease",
+ "Connective Tissue Disease": "connective_tissue_disease",
+ "COPD": "copd",
+ "Dementia": "dementia",
+ "Cerebrovascular Accident": "cerebrovascular_accident",
+ "Peripheral Vascular Disease": "peripheral_vascular_disease",
+ "Congestive Heart Failure": "congestive_heart_failure",
+ "Myocardial Infarction": "myocardial_infarction",
+ "Alcohol": "alcohol",
+}
-@st.cache(allow_output_mutation=True)
-def load_model(
- drugs: bool,
- fuzzy_drugs: bool,
- cim10: bool,
- covid: bool,
- dates: bool,
- measurements: bool,
- charlson: bool,
- sofa: bool,
- priority: bool,
- custom_regex: str,
- adicap: bool,
-):
+@st.cache(allow_output_mutation=True)
+def load_model(custom_regex: str, **enabled):
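+    # `enabled` collects one boolean per PIPES name (plus "fuzzy_drugs"),
+    # forwarded as keyword arguments from the sidebar checkboxes.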
pipes = []
# Declare the pipeline
@@ -88,47 +108,24 @@ def load_model(
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.sentences")
- if drugs:
- if fuzzy_drugs:
- nlp.add_pipe("eds.drugs", config=dict(term_matcher="simstring"))
- pipes.append(
- 'nlp.add_pipe("eds.drugs", config=dict(term_matcher="simstring"))'
- )
- else:
- nlp.add_pipe("eds.drugs")
- pipes.append('nlp.add_pipe("eds.drugs")')
-
- if cim10:
- nlp.add_pipe("eds.cim10")
- pipes.append('nlp.add_pipe("eds.cim10")')
-
- if covid:
- nlp.add_pipe("eds.covid")
- pipes.append('nlp.add_pipe("eds.covid")')
-
- if dates:
- nlp.add_pipe("eds.dates")
- pipes.append('nlp.add_pipe("eds.dates")')
+ for title, name in PIPES.items():
- if measurements:
- nlp.add_pipe("eds.measurements", config={"extract_ranges": True})
- pipes.append('nlp.add_pipe("eds.measurements", config={"extract_ranges": True}')
+ if name == "drugs":
+ if enabled["drugs"]:
+ if enabled["fuzzy_drugs"]:
+ nlp.add_pipe("eds.drugs", config=dict(term_matcher="simstring"))
+ pipes.append(
+ 'nlp.add_pipe("eds.drugs", '
+ 'config=dict(term_matcher="simstring"))'
+ )
+ else:
+ nlp.add_pipe("eds.drugs")
+ pipes.append('nlp.add_pipe("eds.drugs")')
- if charlson:
- nlp.add_pipe("eds.charlson")
- pipes.append('nlp.add_pipe("eds.charlson")')
-
- if sofa:
- nlp.add_pipe("eds.SOFA")
- pipes.append('nlp.add_pipe("eds.SOFA")')
-
- if priority:
- nlp.add_pipe("eds.emergency.priority")
- pipes.append('nlp.add_pipe("eds.emergency.priority")')
-
- if adicap:
- nlp.add_pipe("eds.adicap")
- pipes.append('nlp.add_pipe("eds.adicap")')
+ else:
+ if enabled[name]:
+ nlp.add_pipe(f"eds.{name}")
+ pipes.append(f'nlp.add_pipe("eds.{name}")')
if pipes:
pipes.insert(0, "# Entity extraction pipelines")
@@ -191,19 +188,18 @@ def load_model(
st.sidebar.markdown("The RegEx you defined above is detected under the `custom` label.")
st.sidebar.subheader("Pipeline Components")
-st_cim10 = st.sidebar.checkbox("CIM10 (loading can be slow)", value=False)
+st_pipes = {}
+
+st_pipes["cim10"] = st.sidebar.checkbox("CIM10 (loading can be slow)", value=False)
st_drugs_container = st.sidebar.columns([1, 2])
-st_drugs = st_drugs_container[0].checkbox("Drugs", value=True)
+st_pipes["drugs"] = st_drugs_container[0].checkbox("Drugs", value=True)
st_fuzzy_drugs = st_drugs_container[1].checkbox(
- "Fuzzy drugs search", value=True, disabled=not st_drugs
+ "Fuzzy drugs search", value=True, disabled=not st_pipes["drugs"]
)
-st_covid = st.sidebar.checkbox("COVID", value=True)
-st_dates = st.sidebar.checkbox("Dates", value=True)
-st_measurements = st.sidebar.checkbox("Measurements", value=True)
-st_priority = st.sidebar.checkbox("Emergency Priority Score", value=True)
-st_charlson = st.sidebar.checkbox("Charlson Score", value=True)
-st_sofa = st.sidebar.checkbox("SOFA Score", value=True)
-st_adicap = st.sidebar.checkbox("ADICAP Code", value=True)
+for title, name in PIPES.items():
+ if name == "drugs":
+ continue
+ st_pipes[name] = st.sidebar.checkbox(title, value=True)
st.sidebar.markdown(
"These are just a few of the pipelines provided out-of-the-box by EDS-NLP. "
"See the [documentation](https://aphp.github.io/edsnlp/latest/pipelines/) "
@@ -213,17 +209,9 @@ def load_model(
model_load_state = st.info("Loading model...")
nlp, pipes, regex = load_model(
- drugs=st_drugs,
fuzzy_drugs=st_fuzzy_drugs,
- cim10=st_cim10,
- covid=st_covid,
- dates=st_dates,
- measurements=st_measurements,
- charlson=st_charlson,
- sofa=st_sofa,
- adicap=st_adicap,
- priority=st_priority,
custom_regex=st_custom_regex,
+ **st_pipes,
)
model_load_state.empty()
@@ -236,6 +224,9 @@ def load_model(
)
doc = nlp(text)
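+# Promote date and measurement spans to entities, dropping overlapping spans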
+doc.ents = filter_spans(
+ (*doc.ents, *doc.spans.get("dates", []), *doc.spans.get("measurements", []))
+)
st.header("Visualisation")
@@ -244,25 +235,6 @@ def load_model(
"[Export the pipeline section](#export-the-pipeline) for more information)."
)
-ents = list(doc.ents)
-
-for ent in ents:
- if ent._.score_value:
- ent._.value = ent._.score_value
-
-for date in doc.spans.get("dates", []):
- span = Span(doc, date.start, date.end, label="date")
- span._.value = span._.date.norm()
- ents.append(span)
-
-for measure in doc.spans.get("measurements", []):
- span = Span(doc, measure.start, measure.end, label=measure.label_)
- span._.value = span._.value
- ents.append(span)
-
-
-doc.ents = list(filter_spans(ents))
-
category20 = [
"#1f77b4",
"#aec7e8",
@@ -291,11 +263,12 @@ def load_model(
"covid",
"drug",
"cim10",
- "eds.emergency.priority",
- "eds.SOFA",
- "eds.charlson",
- "eds.size",
- "eds.weight",
+ "emergency_priority",
+ "sofa",
+ "charlson",
+ "size",
+ "weight",
+ "adicap",
]
colors = {label: cat for label, cat in zip(labels, category20)}
@@ -314,38 +287,17 @@ def load_model(
data = []
for ent in doc.ents:
-
- if ent.label_ == "date" or "measure" in ent.label_:
- d = dict(
- start=ent.start_char,
- end=ent.end_char,
- lexical_variant=ent.text,
- label=ent.label_,
- negation="",
- family="",
- hypothesis="",
- reported_speech="",
- )
-
- else:
- d = dict(
- start=ent.start_char,
- end=ent.end_char,
- lexical_variant=ent.text,
- label=ent.label_,
- negation="YES" if ent._.negation else "NO",
- family="YES" if ent._.family else "NO",
- hypothesis="YES" if ent._.hypothesis else "NO",
- reported_speech="YES" if ent._.reported_speech else "NO",
- )
-
- try:
- if ent.kb_id_ and not ent._.value:
- d["normalized_value"] = ent.kb_id_
- else:
- d["normalized_value"] = str(ent._.value)
- except TypeError:
- d["normalized_value"] = ""
+ d = dict(
+ start=ent.start_char,
+ end=ent.end_char,
+ text=ent.text,
+ label=ent.label_,
+ normalized_value=ent._.value or "",
+ negation="YES" if ent._.negation else "NO",
+ family="YES" if ent._.family else "NO",
+ hypothesis="YES" if ent._.hypothesis else "NO",
+ reported_speech="YES" if ent._.reported_speech else "NO",
+ )
data.append(d)
diff --git a/docs/advanced-tutorials/fastapi.md b/docs/advanced-tutorials/fastapi.md
index 68cc6a18a..fb6aae3a9 100644
--- a/docs/advanced-tutorials/fastapi.md
+++ b/docs/advanced-tutorials/fastapi.md
@@ -81,9 +81,7 @@ class Document(BaseModel): # (2)
Having defined the output models and the pipeline, we can move on to creating the application itself:
-
-
-```python title="app.py"
+```{ .python .no-check title="app.py" }
from typing import List
from fastapi import FastAPI
@@ -164,9 +162,7 @@ Go to [`localhost:8000/docs`](http://localhost:8000/docs) to admire the automati
You can try the API directly from the documentation. Otherwise, you may use the `requests` package:
-
-
-```python
+```{ .python .no-check }
import requests
notes = [
diff --git a/docs/assets/fragments/aids-examples.md b/docs/assets/fragments/aids-examples.md
new file mode 100644
index 000000000..b705b4953
--- /dev/null
+++ b/docs/assets/fragments/aids-examples.md
@@ -0,0 +1,55 @@
+=== "SIDA"
+ ```python
+ text = "Patient atteint du VIH au stade SIDA."
+ doc = nlp(text)
+ spans = doc.spans["aids"]
+
+ spans
+ # Out: [VIH au stade SIDA]
+ ```
+
+
+
+=== "VIH"
+ ```python
+ text = "Patient atteint du VIH."
+ doc = nlp(text)
+ spans = doc.spans["aids"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "Coinfection"
+ ```python
+ text = "Il y a un VIH avec coinfection pneumocystose"
+ doc = nlp(text)
+ spans = doc.spans["aids"]
+
+ spans
+ # Out: [VIH]
+
+ span = spans[0]
+
+ span._.assigned
+ # Out: {'opportunist': [coinfection, pneumocystose]}
+ ```
+
+
+
+=== "VIH stade SIDA"
+ ```python
+ text = "Présence d'un VIH stade C"
+ doc = nlp(text)
+ spans = doc.spans["aids"]
+
+ spans
+ # Out: [VIH]
+
+ span = spans[0]
+
+ span._.assigned
+ # Out: {'stage': [C]}
+ ```
diff --git a/docs/assets/fragments/alcohol-examples.md b/docs/assets/fragments/alcohol-examples.md
new file mode 100644
index 000000000..3e1609d6c
--- /dev/null
+++ b/docs/assets/fragments/alcohol-examples.md
@@ -0,0 +1,117 @@
+=== "1"
+ ```python
+ text = "Patient alcoolique."
+ doc = nlp(text)
+ spans = doc.spans["alcohol"]
+
+ spans
+ # Out: [alcoolique]
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "OH chronique."
+ doc = nlp(text)
+ spans = doc.spans["alcohol"]
+
+ spans
+ # Out: [OH]
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "Prise d'alcool occasionnelle"
+ doc = nlp(text)
+ spans = doc.spans["alcohol"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "4"
+ ```python
+ text = "Application d'un pansement alcoolisé"
+ doc = nlp(text)
+ spans = doc.spans["alcohol"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "5"
+ ```python
+ text = "Alcoolisme sevré"
+ doc = nlp(text)
+ spans = doc.spans["alcohol"]
+
+ spans
+ # Out: [Alcoolisme sevré]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: ABSTINENCE
+
+ span._.assigned
+ # Out: {'stopped': [sevré]}
+ ```
+
+
+
+=== "6"
+ ```python
+ text = "Alcoolisme non sevré"
+ doc = nlp(text)
+ spans = doc.spans["alcohol"]
+
+ spans
+ # Out: [Alcoolisme]
+ ```
+
+
+
+=== "7"
+ ```python
+ text = "Alcool: 0"
+ doc = nlp(text)
+ spans = doc.spans["alcohol"]
+
+ spans
+ # Out: [Alcool: 0]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: ABSENT
+
+ span._.assigned
+ # Out: {'zero_after': [0]}
+ ```
+
+
+
+=== "8"
+ ```python
+ text = "Le patient est en cours de sevrage éthylotabagique"
+ doc = nlp(text)
+ spans = doc.spans["alcohol"]
+
+ spans
+ # Out: [sevrage éthylotabagique]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: ABSTINENCE
+
+ span._.assigned
+ # Out: {'stopped': [sevrage]}
+ ```
diff --git a/docs/pipelines/ner/disorders/cerebrovascular_accident.md b/docs/assets/fragments/cerebrovascular-accident-examples.md
similarity index 51%
rename from docs/pipelines/ner/disorders/cerebrovascular_accident.md
rename to docs/assets/fragments/cerebrovascular-accident-examples.md
index 65d42ee86..6280ec686 100644
--- a/docs/pipelines/ner/disorders/cerebrovascular_accident.md
+++ b/docs/assets/fragments/cerebrovascular-accident-examples.md
@@ -1,58 +1,3 @@
-# Cerebrovascular accident
-
-The `eds.cerebrovascular_accident` pipeline component extracts mentions of cerebrovascular accident. It will notably match:
-
-- Mentions of AVC/AIT
-- Mentions of bleeding, hemorrhage, thrombus, ischemia, etc., localized in the brain
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/cerebrovascular_accident/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that match, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.cerebrovascular_accident")
-```
-
-Below are a few examples:
-
-
-
-
=== "1"
```python
text = "Patient hospitalisé à AVC."
@@ -149,7 +94,3 @@ Below are a few examples:
spans
# Out: [thrombolyse]
```
-
-## Authors and citation
-
-The `eds.cerebrovascular_accident` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in details the development of those components is being drafted and will soon be available.
diff --git a/docs/assets/fragments/ckd-examples.md b/docs/assets/fragments/ckd-examples.md
new file mode 100644
index 000000000..469b3054c
--- /dev/null
+++ b/docs/assets/fragments/ckd-examples.md
@@ -0,0 +1,139 @@
+=== "1"
+ ```python
+ text = "Patient atteint d'une glomérulopathie."
+ doc = nlp(text)
+ spans = doc.spans["ckd"]
+
+ spans
+ # Out: [glomérulopathie]
+ ```
+
+
+=== "2"
+ ```python
+ text = "Patient atteint d'une tubulopathie aigüe."
+ doc = nlp(text)
+ spans = doc.spans["ckd"]
+
+ spans
+ # Out: []
+ ```
+
+
+=== "3"
+ ```python
+ text = "Patient transplanté rénal"
+ doc = nlp(text)
+ spans = doc.spans["ckd"]
+
+ spans
+ # Out: [transplanté rénal]
+ ```
+
+
+=== "4"
+ ```python
+ text = "Présence d'une insuffisance rénale aigüe sur chronique"
+ doc = nlp(text)
+ spans = doc.spans["ckd"]
+
+ spans
+ # Out: [insuffisance rénale aigüe sur chronique]
+ ```
+
+
+=== "5"
+ ```python
+ text = "Le patient a été dialysé"
+ doc = nlp(text)
+ spans = doc.spans["ckd"]
+
+ spans
+ # Out: []
+ ```
+
+
+=== "6"
+ ```python
+ text = "Le patient est dialysé chaque lundi"
+ doc = nlp(text)
+ spans = doc.spans["ckd"]
+
+ spans
+ # Out: [dialysé chaque lundi]
+
+ span = spans[0]
+
+ span._.assigned
+ # Out: {'chronic': [lundi]}
+ ```
+
+
+=== "7"
+ ```python
+ text = "Présence d'une IRC"
+ doc = nlp(text)
+ spans = doc.spans["ckd"]
+
+ spans
+ # Out: []
+ ```
+
+
+=== "8"
+ ```python
+ text = "Présence d'une IRC sévère"
+ doc = nlp(text)
+ spans = doc.spans["ckd"]
+
+ spans
+ # Out: [IRC sévère]
+
+ span = spans[0]
+
+ span._.assigned
+ # Out: {'status': sévère}
+ ```
+
+
+=== "9"
+ ```python
+ text = "Présence d'une IRC au stade IV"
+ doc = nlp(text)
+ spans = doc.spans["ckd"]
+
+ spans
+ # Out: [IRC au stade IV]
+
+ span = spans[0]
+
+ span._.assigned
+ # Out: {'stage': IV}
+ ```
+
+
+=== "10"
+ ```python
+ text = "Présence d'une IRC avec DFG à 30"
+ doc = nlp(text)
+ spans = doc.spans["ckd"]
+
+ spans
+ # Out: [IRC avec DFG à 30]
+
+ span = spans[0]
+
+ span._.assigned
+ # Out: {'dfg': 30}
+ ```
+
+
+=== "11"
+ ```python
+ text = "Présence d'une maladie rénale avec DFG à 110"
+ doc = nlp(text)
+ spans = doc.spans["ckd"]
+
+ spans
+ # Out: []
+ ```
diff --git a/docs/assets/fragments/congestive-heart-failure-examples.md b/docs/assets/fragments/congestive-heart-failure-examples.md
new file mode 100644
index 000000000..de26c3000
--- /dev/null
+++ b/docs/assets/fragments/congestive-heart-failure-examples.md
@@ -0,0 +1,50 @@
+
+=== "1"
+ ```python
+ text = "Présence d'un oedème pulmonaire"
+ doc = nlp(text)
+ spans = doc.spans["congestive_heart_failure"]
+
+ spans
+ # Out: [oedème pulmonaire]
+ ```
+
+=== "2"
+ ```python
+ text = "Le patient est équipé d'un pace-maker"
+ doc = nlp(text)
+ spans = doc.spans["congestive_heart_failure"]
+
+ spans
+ # Out: [pace-maker]
+ ```
+
+=== "3"
+ ```python
+ text = "Un cardiopathie non décompensée"
+ doc = nlp(text)
+ spans = doc.spans["congestive_heart_failure"]
+
+ spans
+ # Out: []
+ ```
+
+=== "4"
+ ```python
+ text = "Insuffisance cardiaque"
+ doc = nlp(text)
+ spans = doc.spans["congestive_heart_failure"]
+
+ spans
+ # Out: [Insuffisance cardiaque]
+ ```
+
+=== "5"
+ ```python
+ text = "Insuffisance cardiaque minime"
+ doc = nlp(text)
+ spans = doc.spans["congestive_heart_failure"]
+
+ spans
+ # Out: []
+ ```
diff --git a/docs/assets/fragments/connective-tissue-disease-examples.md b/docs/assets/fragments/connective-tissue-disease-examples.md
new file mode 100644
index 000000000..1ad5a0753
--- /dev/null
+++ b/docs/assets/fragments/connective-tissue-disease-examples.md
@@ -0,0 +1,57 @@
+=== "1"
+ ```python
+ text = "Présence d'une sclérodermie."
+ doc = nlp(text)
+ spans = doc.spans["connective_tissue_disease"]
+
+ spans
+ # Out: [sclérodermie]
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "Patient atteint d'un lupus."
+ doc = nlp(text)
+ spans = doc.spans["connective_tissue_disease"]
+
+ spans
+ # Out: [lupus]
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "Présence d'anticoagulants lupiques,"
+ doc = nlp(text)
+ spans = doc.spans["connective_tissue_disease"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "4"
+ ```python
+ text = "Il y a une MICI."
+ doc = nlp(text)
+ spans = doc.spans["connective_tissue_disease"]
+
+ spans
+ # Out: [MICI]
+ ```
+
+
+
+=== "5"
+ ```python
+ text = "Syndrome de Raynaud"
+ doc = nlp(text)
+ spans = doc.spans["connective_tissue_disease"]
+
+ spans
+ # Out: [Raynaud]
+ ```
diff --git a/docs/assets/fragments/copd-examples.md b/docs/assets/fragments/copd-examples.md
new file mode 100644
index 000000000..a2aad8250
--- /dev/null
+++ b/docs/assets/fragments/copd-examples.md
@@ -0,0 +1,74 @@
+=== "1"
+ ```python
+ text = "Une fibrose interstitielle diffuse idiopathique"
+ doc = nlp(text)
+ spans = doc.spans["copd"]
+
+ spans
+ # Out: [fibrose interstitielle diffuse idiopathique]
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "Patient atteint de pneumoconiose"
+ doc = nlp(text)
+ spans = doc.spans["copd"]
+
+ spans
+ # Out: [pneumoconiose]
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "Présence d'une HTAP."
+ doc = nlp(text)
+ spans = doc.spans["copd"]
+
+ spans
+ # Out: [HTAP]
+ ```
+
+
+
+=== "4"
+ ```python
+ text = "On voit une hypertension pulmonaire minime"
+ doc = nlp(text)
+ spans = doc.spans["copd"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "5"
+ ```python
+ text = "La patiente a été mis sous oxygénorequérance"
+ doc = nlp(text)
+ spans = doc.spans["copd"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "6"
+ ```python
+ text = "La patiente est sous oxygénorequérance au long cours"
+ doc = nlp(text)
+ spans = doc.spans["copd"]
+
+ spans
+ # Out: [oxygénorequérance au long cours]
+
+ span = spans[0]
+
+ span._.assigned
+ # Out: {'long': [long cours]}
+ ```
diff --git a/docs/assets/fragments/dementia-examples.md b/docs/assets/fragments/dementia-examples.md
new file mode 100644
index 000000000..826e043f3
--- /dev/null
+++ b/docs/assets/fragments/dementia-examples.md
@@ -0,0 +1,45 @@
+=== "1"
+ ```python
+ text = "D'importants déficits cognitifs"
+ doc = nlp(text)
+ spans = doc.spans["dementia"]
+
+ spans
+ # Out: [déficits cognitifs]
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "Patient atteint de démence"
+ doc = nlp(text)
+ spans = doc.spans["dementia"]
+
+ spans
+ # Out: [démence]
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "On retrouve des anti-SLA"
+ doc = nlp(text)
+ spans = doc.spans["dementia"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "4"
+ ```python
+ text = "Une maladie de Charcot"
+ doc = nlp(text)
+ spans = doc.spans["dementia"]
+
+ spans
+ # Out: [maladie de Charcot]
+ ```
diff --git a/docs/assets/fragments/diabetes-examples.md b/docs/assets/fragments/diabetes-examples.md
new file mode 100644
index 000000000..da35b4276
--- /dev/null
+++ b/docs/assets/fragments/diabetes-examples.md
@@ -0,0 +1,102 @@
+=== "1"
+ ```python
+ text = "Présence d'un DT2"
+ doc = nlp(text)
+ spans = doc.spans["diabetes"]
+
+ spans
+ # Out: [DT2]
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "Présence d'un DNID"
+ doc = nlp(text)
+ spans = doc.spans["diabetes"]
+
+ spans
+ # Out: [DNID]
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "Patient diabétique"
+ doc = nlp(text)
+ spans = doc.spans["diabetes"]
+
+ spans
+ # Out: [diabétique]
+ ```
+
+
+
+=== "4"
+ ```python
+ text = "Un diabète insipide"
+ doc = nlp(text)
+ spans = doc.spans["diabetes"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "5"
+ ```python
+ text = "Atteinte neurologique d'origine diabétique"
+ doc = nlp(text)
+ spans = doc.spans["diabetes"]
+
+ spans
+ # Out: [origine diabétique]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: WITH_COMPLICATION
+
+ span._.assigned
+ # Out: {'complicated_before': [origine]}
+ ```
+
+
+
+=== "6"
+ ```python
+ text = "Une rétinopathie diabétique"
+ doc = nlp(text)
+ spans = doc.spans["diabetes"]
+
+ spans
+ # Out: [rétinopathie diabétique]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: WITH_COMPLICATION
+
+ span._.assigned
+ # Out: {'complicated_before': [rétinopathie]}
+ ```
+
+
+
+=== "7"
+ ```python
+ text = "Il y a un mal perforant plantaire"
+ doc = nlp(text)
+ spans = doc.spans["diabetes"]
+
+ spans
+ # Out: [mal perforant plantaire]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: WITH_COMPLICATION
+ ```
diff --git a/docs/assets/fragments/hemiplegia-examples.md b/docs/assets/fragments/hemiplegia-examples.md
new file mode 100644
index 000000000..fbc5f957c
--- /dev/null
+++ b/docs/assets/fragments/hemiplegia-examples.md
@@ -0,0 +1,33 @@
+=== "1"
+ ```python
+ text = "Patient hémiplégique"
+ doc = nlp(text)
+ spans = doc.spans["hemiplegia"]
+
+ spans
+ # Out: [hémiplégique]
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "Paralysie des membres inférieurs"
+ doc = nlp(text)
+ spans = doc.spans["hemiplegia"]
+
+ spans
+ # Out: [Paralysie des membres]
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "Patient en LIS"
+ doc = nlp(text)
+ spans = doc.spans["hemiplegia"]
+
+ spans
+ # Out: [LIS]
+ ```
diff --git a/docs/assets/fragments/leukemia-examples.md b/docs/assets/fragments/leukemia-examples.md
new file mode 100644
index 000000000..f942dcc8f
--- /dev/null
+++ b/docs/assets/fragments/leukemia-examples.md
@@ -0,0 +1,45 @@
+=== "1"
+ ```python
+ text = "Sydrome myéloprolifératif"
+ doc = nlp(text)
+ spans = doc.spans["leukemia"]
+
+ spans
+ # Out: [myéloprolifératif]
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "Sydrome myéloprolifératif bénin"
+ doc = nlp(text)
+ spans = doc.spans["leukemia"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "Patient atteint d'une LAM"
+ doc = nlp(text)
+ spans = doc.spans["leukemia"]
+
+ spans
+ # Out: [LAM]
+ ```
+
+
+
+=== "4"
+ ```python
+ text = "Une maladie de Vaquez"
+ doc = nlp(text)
+ spans = doc.spans["leukemia"]
+
+ spans
+ # Out: [Vaquez]
+ ```
diff --git a/docs/assets/fragments/liver-disease-examples.md b/docs/assets/fragments/liver-disease-examples.md
new file mode 100644
index 000000000..2ef6953e2
--- /dev/null
+++ b/docs/assets/fragments/liver-disease-examples.md
@@ -0,0 +1,55 @@
+=== "1"
+ ```python
+ text = "Il y a une fibrose hépatique"
+ doc = nlp(text)
+ spans = doc.spans["liver_disease"]
+
+ spans
+ # Out: [fibrose hépatique]
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "Une hépatite B chronique"
+ doc = nlp(text)
+ spans = doc.spans["liver_disease"]
+
+ spans
+ # Out: [hépatite B chronique]
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "Le patient consulte pour une cirrhose"
+ doc = nlp(text)
+ spans = doc.spans["liver_disease"]
+
+ spans
+ # Out: [cirrhose]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: MODERATE_TO_SEVERE
+ ```
+
+
+
+=== "4"
+ ```python
+ text = "Greffe hépatique."
+ doc = nlp(text)
+ spans = doc.spans["liver_disease"]
+
+ spans
+ # Out: [Greffe hépatique]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: MODERATE_TO_SEVERE
+ ```
diff --git a/docs/assets/fragments/lymphoma-examples.md b/docs/assets/fragments/lymphoma-examples.md
new file mode 100644
index 000000000..f1c5956a8
--- /dev/null
+++ b/docs/assets/fragments/lymphoma-examples.md
@@ -0,0 +1,45 @@
+=== "1"
+ ```python
+ text = "Un lymphome de Hodgkin."
+ doc = nlp(text)
+ spans = doc.spans["lymphoma"]
+
+ spans
+ # Out: [lymphome de Hodgkin]
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "Atteint d'un Waldenstörm"
+ doc = nlp(text)
+ spans = doc.spans["lymphoma"]
+
+ spans
+ # Out: [Waldenstörm]
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "Un LAGC"
+ doc = nlp(text)
+ spans = doc.spans["lymphoma"]
+
+ spans
+ # Out: [LAGC]
+ ```
+
+
+
+=== "4"
+ ```python
+ text = "anti LAGC: 10^4/mL"
+ doc = nlp(text)
+ spans = doc.spans["lymphoma"]
+
+ spans
+ # Out: []
+ ```
diff --git a/docs/assets/fragments/myocardial-infarction-examples.md b/docs/assets/fragments/myocardial-infarction-examples.md
new file mode 100644
index 000000000..a9547d544
--- /dev/null
+++ b/docs/assets/fragments/myocardial-infarction-examples.md
@@ -0,0 +1,67 @@
+=== "1"
+ ```python
+ text = "Une cardiopathie ischémique"
+ doc = nlp(text)
+ spans = doc.spans["myocardial_infarction"]
+
+ spans
+ # Out: [cardiopathie ischémique]
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "Une cardiopathie non-ischémique"
+ doc = nlp(text)
+ spans = doc.spans["myocardial_infarction"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "Présence d'un stent sur la marginale"
+ doc = nlp(text)
+ spans = doc.spans["myocardial_infarction"]
+
+ spans
+ # Out: [stent sur la marginale]
+
+ span = spans[0]
+
+ span._.assigned
+ # Out: {'heart_localized': [marginale]}
+ ```
+
+
+
+=== "4"
+ ```python
+ text = "Présence d'un stent périphérique"
+ doc = nlp(text)
+ spans = doc.spans["myocardial_infarction"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "5"
+ ```python
+ text = "infarctus du myocarde"
+ doc = nlp(text)
+ spans = doc.spans["myocardial_infarction"]
+
+ spans
+ # Out: [infarctus du myocarde]
+
+ span = spans[0]
+
+ span._.assigned
+ # Out: {'heart_localized': [myocarde]}
+ ```
diff --git a/docs/assets/fragments/peptic-ulcer-disease-examples.md b/docs/assets/fragments/peptic-ulcer-disease-examples.md
new file mode 100644
index 000000000..c2a7ac52f
--- /dev/null
+++ b/docs/assets/fragments/peptic-ulcer-disease-examples.md
@@ -0,0 +1,50 @@
+=== "1"
+ ```python
+ text = "Beaucoup d'ulcères gastriques"
+ doc = nlp(text)
+ spans = doc.spans["peptic_ulcer_disease"]
+
+ spans
+ # Out: [ulcères gastriques]
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "Présence d'UGD"
+ doc = nlp(text)
+ spans = doc.spans["peptic_ulcer_disease"]
+
+ spans
+ # Out: [UGD]
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "La patient à des ulcères"
+ doc = nlp(text)
+ spans = doc.spans["peptic_ulcer_disease"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "4"
+ ```python
+ text = "Au niveau gastrique: blabla blabla blabla blabla blabla quelques ulcères"
+ doc = nlp(text)
+ spans = doc.spans["peptic_ulcer_disease"]
+
+ spans
+ # Out: [ulcères]
+
+ span = spans[0]
+
+ span._.assigned
+ # Out: {'is_peptic': [gastrique]}
+ ```
diff --git a/docs/pipelines/ner/disorders/peripheral_vascular_disease.md b/docs/assets/fragments/peripheral-vascular-disease-examples.md
similarity index 64%
rename from docs/pipelines/ner/disorders/peripheral_vascular_disease.md
rename to docs/assets/fragments/peripheral-vascular-disease-examples.md
index f0c6f09b7..f31ab8218 100644
--- a/docs/pipelines/ner/disorders/peripheral_vascular_disease.md
+++ b/docs/assets/fragments/peripheral-vascular-disease-examples.md
@@ -1,55 +1,3 @@
-# Peripheral vascular disease
-
-The `eds.peripheral_vascular_disease` pipeline component extracts mentions of peripheral vascular disease.
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/peripheral_vascular_disease/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that match, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.peripheral_vascular_disease")
-```
-
-Below are a few examples:
-
-
-
-
=== "1"
```python
text = "Un AOMI"
@@ -208,7 +156,3 @@ Below are a few examples:
spans
# Out: []
```
-
-## Authors and citation
-
-The `eds.peripheral_vascular_disease` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in details the development of those components is being drafted and will soon be available.
diff --git a/docs/pipelines/ner/disorders/solid_tumor.md b/docs/assets/fragments/solid-tumor-examples.md
similarity index 50%
rename from docs/pipelines/ner/disorders/solid_tumor.md
rename to docs/assets/fragments/solid-tumor-examples.md
index b7cf2e1da..bd836882a 100644
--- a/docs/pipelines/ner/disorders/solid_tumor.md
+++ b/docs/assets/fragments/solid-tumor-examples.md
@@ -1,59 +1,3 @@
-# Solid tumor
-
-The `eds.solid_tumor` pipeline component extracts mentions of solid tumors. It will notably match:
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/solid_tumor/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that match, the following attributes are available:
-
-- `span._.detailled_status`: set to either
- - `"METASTASIS"` for tumors at the metastatic stage
- - `"LOCALIZED"` else
-- `span._.assigned`: dictionary with the following keys, if relevant:
- - `stage`: stage of the tumor
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.solid_tumor")
-```
-
-Below are a few examples:
-
-
-
-
=== "1"
```python
text = "Présence d'un carcinome intra-hépatique."
@@ -101,7 +45,7 @@ Below are a few examples:
span = spans[0]
- span._.detailled_status
+ span._.detailed_status
# Out: METASTASIS
span._.assigned
@@ -121,7 +65,7 @@ Below are a few examples:
span = spans[0]
- span._.detailled_status
+ span._.detailed_status
# Out: METASTASIS
span._.assigned
@@ -158,10 +102,6 @@ Below are a few examples:
span = spans[0]
- span._.detailled_status
+ span._.detailed_status
# Out: METASTASIS
```
-
-## Authors and citation
-
-The `eds.solid_tumor` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in details the development of those components is being drafted and will soon be available.
diff --git a/docs/assets/fragments/tobacco-examples.md b/docs/assets/fragments/tobacco-examples.md
new file mode 100644
index 000000000..9422c0d3c
--- /dev/null
+++ b/docs/assets/fragments/tobacco-examples.md
@@ -0,0 +1,118 @@
+=== "1"
+ ```python
+ text = "Tabagisme évalué à 15 PA"
+ doc = nlp(text)
+ spans = doc.spans["tobacco"]
+
+ spans
+ # Out: [Tabagisme évalué à 15 PA]
+
+ span = spans[0]
+
+ span._.assigned
+ # Out: {'PA': 15}
+ ```
+
+
+
+=== "2"
+ ```python
+ text = "Patient tabagique"
+ doc = nlp(text)
+ spans = doc.spans["tobacco"]
+
+ spans
+ # Out: [tabagique]
+ ```
+
+
+
+=== "3"
+ ```python
+ text = "Tabagisme festif"
+ doc = nlp(text)
+ spans = doc.spans["tobacco"]
+
+ spans
+ # Out: []
+ ```
+
+
+
+=== "4"
+ ```python
+ text = "On a un tabagisme ancien"
+ doc = nlp(text)
+ spans = doc.spans["tobacco"]
+
+ spans
+ # Out: [tabagisme ancien]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: ABSTINENCE
+
+ span._.assigned
+ # Out: {'stopped': [ancien]}
+ ```
+
+
+
+=== "5"
+ ```python
+ text = "Tabac: 0"
+ doc = nlp(text)
+ spans = doc.spans["tobacco"]
+
+ spans
+ # Out: [Tabac: 0]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: ABSENT
+
+ span._.assigned
+ # Out: {'zero_after': [0]}
+ ```
+
+
+
+=== "6"
+ ```python
+ text = "Tabagisme passif"
+ doc = nlp(text)
+ spans = doc.spans["tobacco"]
+
+ spans
+ # Out: [Tabagisme passif]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: ABSENT
+
+ span._.assigned
+ # Out: {'secondhand': passif}
+ ```
+
+
+
+=== "7"
+ ```python
+ text = "Tabac: sevré depuis 5 ans"
+ doc = nlp(text)
+ spans = doc.spans["tobacco"]
+
+ spans
+ # Out: [Tabac: sevré]
+
+ span = spans[0]
+
+ span._.detailed_status
+ # Out: ABSTINENCE
+
+ span._.assigned
+ # Out: {'stopped': [sevré]}
+ ```
diff --git a/docs/assets/templates/python/material/class.html b/docs/assets/templates/python/material/class.html
new file mode 100644
index 000000000..ab982e5ae
--- /dev/null
+++ b/docs/assets/templates/python/material/class.html
@@ -0,0 +1,127 @@
+
+{% with html_id = class.path %}
+
+ {% if config.only_parameters or config.only_class_level %}
+
+ {% with docstring_sections = class.docstring.parsed %}
+ {% include "docstring.html" with context %}
+ {% endwith %}
+
+ {% if config.merge_init_into_class %}
+ {% if "__init__" in class.members and class.members["__init__"].has_docstring %}
+ {% with docstring_sections = class.members["__init__"].docstring.parsed %}
+ {% include "docstring.html" with context %}
+ {% endwith %}
+ {% endif %}
+ {% endif %}
+
+ {% else %}
+
+ {% if root %}
+ {% set show_full_path = config.show_root_full_path %}
+ {% set root_members = True %}
+ {% elif root_members %}
+ {% set show_full_path = config.show_root_members_full_path or config.show_object_full_path %}
+ {% set root_members = False %}
+ {% else %}
+ {% set show_full_path = config.show_object_full_path %}
+ {% endif %}
+
+ {% if not root or config.show_root_heading %}
+
+ {% filter heading(heading_level,
+ role="class",
+ id=html_id,
+ class="doc doc-heading",
+ toc_label=class.name) %}
+
+ {% if config.separate_signature %}
+
+      {% if show_full_path %}{{ class.path }}{% else %}{{ class.name }}{% endif %}
+ {% elif config.merge_init_into_class and "__init__" in class.members -%}
+ {%- with function = class.members["__init__"] -%}
+ {%- filter highlight(language="python", inline=True) -%}
+ {% if show_full_path %}{{ class.path }}{% else %}{{ class.name }}{% endif %}
+ {%- include "signature.html" with context -%}
+ {%- endfilter -%}
+ {%- endwith -%}
+ {% else %}
+
+      {% if show_full_path %}{{ class.path }}{% else %}{{ class.name }}{% endif %}
+ {% endif %}
+
+ {% with labels = class.labels %}
+ {% include "labels.html" with context %}
+ {% endwith %}
+
+ {% endfilter %}
+
+ {% if config.separate_signature and config.merge_init_into_class %}
+ {% if "__init__" in class.members %}
+ {% with function = class.members["__init__"] %}
+ {% filter format_signature(function, config.line_length, crossrefs=config.signature_crossrefs) %}
+ {% if show_full_path %}{{ class.path }}{% else %}{{ class.name }}{% endif %}
+ {% endfilter %}
+ {% endwith %}
+ {% endif %}
+ {% endif %}
+
+ {% else %}
+ {% if config.show_root_toc_entry %}
+ {% filter heading(heading_level,
+ role="class",
+ id=html_id,
+ toc_label=class.path if config.show_root_full_path else class.name,
+ hidden=True) %}
+ {% endfilter %}
+ {% endif %}
+ {% set heading_level = heading_level - 1 %}
+ {% endif %}
+
+
+ {% if config.show_bases and class.bases %}
+
+ Bases: {% for expression in class.bases -%}
+ {% include "expression.html" with context %}
+      {% if not loop.last %}, {% endif %}
+ {% endfor -%}
+
+ {% endif %}
+
+ {% with docstring_sections = class.docstring.parsed %}
+ {% with is_merged_init = True %}
+ {% include "docstring.html" with context %}
+ {% endwith %}
+ {% endwith %}
+
+ {% if config.merge_init_into_class %}
+ {% if "__init__" in class.members and class.members["__init__"].has_docstring %}
+ {% with docstring_sections = class.members["__init__"].docstring.parsed, is_merged_init = True %}
+ {% include "docstring.html" with context %}
+ {% endwith %}
+ {% endif %}
+ {% endif %}
+
+ {% if config.show_source %}
+ {% if config.merge_init_into_class %}
+ {% if "__init__" in class.members and class.members["__init__"].source %}
+
+ Source code in {{ class.relative_filepath }}
+ {{ class.members["__init__"].source|highlight(language="python", linestart=class.members["__init__"].lineno, linenums=True) }}
+
+ {% endif %}
+ {% elif class.source %}
+
+ Source code in {{ class.relative_filepath }}
+ {{ class.source|highlight(language="python", linestart=class.lineno, linenums=True) }}
+
+ {% endif %}
+ {% endif %}
+
+ {% with obj = class %}
+ {% set root = False %}
+ {% set heading_level = heading_level + 1 %}
+ {% include "children.html" with context %}
+ {% endwith %}
+
+ {% endif %}
+
+{% endwith %}
+
diff --git a/docs/assets/templates/python/material/docstring.html b/docs/assets/templates/python/material/docstring.html
index bbb788456..4d1ae244a 100644
--- a/docs/assets/templates/python/material/docstring.html
+++ b/docs/assets/templates/python/material/docstring.html
@@ -3,7 +3,7 @@
{% for section in docstring_sections %}
{% if not config.only_parameters %}
{% if section.kind.value == "text" %}
- {{ section.value|convert_markdown(heading_level, html_id) }}
+ {{ section.value|convert_markdown(heading_level - 1, html_id) }}
{% elif section.kind.value == "attributes" %}
{% include "docstring/attributes.html" with context %}
{% elif section.kind.value == "parameters" %}
@@ -27,6 +27,8 @@
{% endif %}
{% elif section.kind.value == "parameters" %}
{% include "docstring/parameters.html" with context %}
+ {% elif section.kind.value == "attributes" %}
+ {% include "docstring/attributes.html" with context %}
{% endif %}
{% endfor %}
{% endif %}
diff --git a/docs/assets/templates/python/material/docstring/examples.html b/docs/assets/templates/python/material/docstring/examples.html
new file mode 100644
index 000000000..394e085ce
--- /dev/null
+++ b/docs/assets/templates/python/material/docstring/examples.html
@@ -0,0 +1,8 @@
+{{ "# Examples\n"|convert_markdown(heading_level, html_id) }}
+{% for section_type, sub_section in section.value %}
+ {% if section_type.value == "text" %}
+ {{ sub_section|convert_markdown(heading_level, html_id) }}
+ {% elif section_type.value == "examples" %}
+ {{ sub_section|convert_markdown(heading_level, html_id) }}
+ {% endif %}
+{% endfor %}
diff --git a/docs/assets/templates/python/material/docstring/parameters.html b/docs/assets/templates/python/material/docstring/parameters.html
index 411de6b56..919d0c82f 100644
--- a/docs/assets/templates/python/material/docstring/parameters.html
+++ b/docs/assets/templates/python/material/docstring/parameters.html
@@ -1,4 +1,5 @@
{{ log.debug("Rendering parameters section") }}
+{{ "# Parameters\n"|convert_markdown(heading_level, html_id) }}
{% if config.docstring_section_style == "table" %}
{% block table_style %}
{{ section.title or "Parameters:" }}
diff --git a/docs/index.md b/docs/index.md
index 5600aebe8..3c883f305 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -33,7 +33,7 @@ Once you've installed the library, let's begin with a very simple example that e
```python
import spacy
-nlp = spacy.blank("fr") # (1)
+nlp = spacy.blank("eds") # (1)
terms = dict(
covid=["covid", "coronavirus"], # (2)
@@ -67,60 +67,7 @@ This example is complete, it should run as-is. Check out the [spaCy 101 page](tu
## Available pipeline components
-=== "Core"
-
- | Pipeline | Description |
- | ------------------------ | ----------------------------------------------- |
- | `eds.normalizer` | Non-destructive input text normalisation |
- | `eds.sentences` | Better sentence boundary detection |
- | `eds.matcher` | A simple yet powerful entity extractor |
- | `eds.terminology` | A simple yet powerful terminology matcher |
- | `eds.contextual-matcher` | A conditional entity extractor |
- | `eds.endlines` | An unsupervised model to classify each end line |
-
-=== "Qualifiers"
-
- | Pipeline | Description |
- | --------------------- | ------------------------------------ |
- | `eds.negation` | Rule-based negation detection |
- | `eds.family` | Rule-based family context detection |
- | `eds.hypothesis` | Rule-based speculation detection |
- | `eds.reported_speech` | Rule-based reported speech detection |
- | `eds.history` | Rule-based medical history detection |
-
-=== "Miscellaneous"
-
- | Pipeline | Description |
- | ------------------------ | ------------------------------------------- |
- | `eds.dates` | Date extraction and normalisation |
- | `eds.consultation_dates` | Identify consultation dates |
- | `eds.measurements` | Measure extraction and normalisation |
- | `eds.sections` | Section detection |
- | `eds.reason` | Rule-based hospitalisation reason detection |
- | `eds.tables` | Tables detection |
-
-=== "NER"
-
- | Pipeline | Description |
- | ------------------------ | --------------------------- |
- | `eds.covid` | A COVID mentions detector |
- | `eds.charlson` | A Charlson score extractor |
- | `eds.sofa` | A SOFA score extractor |
- | `eds.emergency.priority` | A priority score extractor |
- | `eds.emergency.ccmu` | A CCMU score extractor |
- | `eds.emergency.gemsa` | A GEMSA score extractor |
- | `eds.TNM` | A TNM score extractor |
- | `eds.cim10` | A CIM10 terminology matcher |
- | `eds.drugs` | A Drug mentions extractor |
- | `eds.adicap` | A ADICAP codes extractor |
- | `eds.umls` | A UMLS terminology matcher |
-
-=== "Trainable"
-
- | Pipeline | Description |
- | -------------------- | -------------------------------------------------------------------- |
- | `eds.nested-ner` | Nested and overlapping named entity recogntion |
- | `eds.span-qualifier` | A trainable component for multi-class multi-label span qualification |
+--8<-- "docs/pipelines/overview.md:components"
## Disclaimer
@@ -142,5 +89,3 @@ If you use EDS-NLP, please cite us as below.
url = {http://aphp.github.io/edsnlp}
}
```
-
-\bibliography
diff --git a/docs/pipelines/core/contextual-matcher.md b/docs/pipelines/core/contextual-matcher.md
index 3d8c6f72d..439ca5489 100644
--- a/docs/pipelines/core/contextual-matcher.md
+++ b/docs/pipelines/core/contextual-matcher.md
@@ -21,7 +21,7 @@ Let us see step by step how to build such a list using the example stated just a
To do this, we can build either a set of `terms` or a set of `regex`. `terms` will be used to search for exact matches in the text. While less flexible,
it is faster than using regex. In our case we could use the following lists (which are of course not exhaustive):
-```python3
+```python
terms = [
"cancer",
"tumeur",
@@ -36,7 +36,7 @@ regex = [
Maybe we want to exclude mentions of benign cancers:
-```python3
+```python
benign = "benign|benin"
```
@@ -44,7 +44,7 @@ benign = "benign|benin"
For this we will forge a RegEx with one capturing group (basically a pattern enclosed in parentheses):
-```python3
+```python
stage = "stade (I{1,3}V?|[1234])"
```
@@ -52,7 +52,7 @@ This will extract stage between 1 and 4
We can add a second regex to try to capture if the cancer is in a metastasis stage or not:
-```python3
+```python
metastase = "(metasta)"
```
@@ -60,8 +60,7 @@ metastase = "(metasta)"
We can now put everything together:
-```python3
-
+```python
cancer = dict(
source="Cancer solide",
regex=regex,
@@ -75,8 +74,8 @@ cancer = dict(
dict(
name="stage",
regex=stage,
- window=(-10,10),
- replace_entity=True,
+ window=(-10, 10),
+ replace_entity=False,
reduce_mode=None,
),
dict(
@@ -86,7 +85,7 @@ cancer = dict(
replace_entity=False,
reduce_mode="keep_last",
),
- ]
+ ],
)
```
@@ -108,7 +107,7 @@ lymphome = dict(
In this case, the configuration can be concatenated in a list:
-```python3
+```python
patterns = [cancer, lymphome]
```
@@ -143,12 +142,12 @@ This parameter can be set to `True` **only for a single assign key per dictionary**
**Please notice** that with `replace_entity` set to `True`, if the corresponding assign key matches nothing, the entity will be discarded.
-## Usage
+## Examples
```python
import spacy
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe("sentences")
nlp.add_pipe("normalizer")
@@ -158,6 +157,7 @@ nlp.add_pipe(
name="Cancer",
config=dict(
patterns=patterns,
+ label="cancer",
),
)
```
@@ -167,15 +167,13 @@ Let us see what we can get from this pipeline with a few examples
=== "Simple match"
-
-
- ```python3
+ ```python
txt = "Le patient a eu un cancer il y a 5 ans"
doc = nlp(txt)
ent = doc.ents[0]
ent.label_
- # Out: Cancer
+ # Out: cancer
ent._.source
# Out: Cancer solide
@@ -188,9 +186,7 @@ Let us see what we can get from this pipeline with a few examples
Let us check that when a *benign* mention is present, the extraction is excluded:
-
-
- ```python3
+ ```python
txt = "Le patient a eu un cancer relativement bénin il y a 5 ans"
doc = nlp(txt)
@@ -200,12 +196,10 @@ Let us see what we can get from this pipeline with a few examples
=== "Extracting additional infos"
-
-
    All information extracted from the provided `assign` configuration can be found in the `assigned` attribute
    in the form of a dictionary:
- ```python3
+ ```python
txt = "Le patient a eu un cancer de stade 3."
doc = nlp(txt)
@@ -213,14 +207,6 @@ Let us see what we can get from this pipeline with a few examples
# Out: {'stage': '3'}
```
-## Configuration
-
-The pipeline can be configured using the following parameters :
-
-::: edsnlp.pipelines.core.contextual_matcher.factory.create_component
- options:
- only_parameters: true
-
However, most of the configuration is provided in the `patterns` key, as a **pattern dictionary** or a **list of pattern dictionaries**
## The pattern dictionary
@@ -293,7 +279,7 @@ A pattern is a nested dictionary with the following keys:
### A full pattern dictionary example
-```python3
+```python
dict(
source="AVC",
regex=[
@@ -336,10 +322,13 @@ dict(
expand_entity=False,
window=-3,
),
- ]
+ ],
)
```
+::: edsnlp.pipelines.core.contextual_matcher.factory.create_component
+ options:
+ only_parameters: true
## Authors and citation
diff --git a/docs/pipelines/core/endlines.md b/docs/pipelines/core/endlines.md
index 965b05e5f..3845cd0e9 100644
--- a/docs/pipelines/core/endlines.md
+++ b/docs/pipelines/core/endlines.md
@@ -1,95 +1,8 @@
-# Endlines
-
-The `eds.endlines` pipeline classifies newline characters as actual end of lines or mere spaces. In the latter case, the token is removed from the normalised document.
-
-Behind the scenes, it uses a `endlinesmodel` instance, which is an unsupervised algorithm based on the work of Zweigenbaum et al[@zweigenbaum2016].
-
-## Usage
-
-The following example shows a simple usage.
-
-### Training
-
-```python
-import spacy
-from edsnlp.pipelines.core.endlines.endlinesmodel import EndLinesModel
-
-nlp = spacy.blank("fr")
-
-texts = [
- """Le patient est arrivé hier soir.
-Il est accompagné par son fils
-
-ANTECEDENTS
-Il a fait une TS en 2010;
-Fumeur, il est arreté il a 5 mois
-Chirurgie de coeur en 2011
-CONCLUSION
-Il doit prendre
-le medicament indiqué 3 fois par jour. Revoir médecin
-dans 1 mois.
-DIAGNOSTIC :
-
-Antecedents Familiaux:
-- 1. Père avec diabete
-
-""",
- """J'aime le \nfromage...\n""",
-]
-
-docs = list(nlp.pipe(texts))
-
-# Train and predict an EndLinesModel
-endlines = EndLinesModel(nlp=nlp)
-
-df = endlines.fit_and_predict(docs)
-df.head()
-
-PATH = "/tmp/path_to_save"
-endlines.save(PATH)
-```
-
-### Inference
-
-```python
-import spacy
-from spacy.tokens import Span
-from spacy import displacy
-
-nlp = spacy.blank("fr")
-
-PATH = "/tmp/path_to_save"
-nlp.add_pipe("eds.endlines", config=dict(model_path=PATH))
-
-docs = list(nlp.pipe(texts))
-
-doc_exemple = docs[1]
-
-doc_exemple.ents = tuple(
- Span(doc_exemple, token.i, token.i + 1, "excluded")
- for token in doc_exemple
- if token.tag_ == "EXCLUDED"
-)
-
-displacy.render(doc_exemple, style="ent", options={"colors": {"space": "red"}})
-```
-
-## Configuration
-
-The pipeline can be configured using the following parameters :
-
-| Parameter | Explanation | Default |
-| ------------ | -------------------------------- | -------- |
-| `model_path` | Path to the pre-trained pipeline | Required |
-
-## Declared extensions
-
-The `eds.endlines` pipeline declares one [spaCy extensions](https://spacy.io/usage/processing-pipelines#custom-components-attributes), on both `Span` and `Token` objects. The `end_line` attribute is a boolean, set to `True` if the pipeline predicts that the new line is an end line character. Otherwise, it is set to `False` if the new line is classified as a space.
-
-The pipeline also sets the `excluded` custom attribute on newlines that are classified as spaces. It lets downstream matchers skip excluded tokens (see [normalisation](./normalisation.md)) for more detail.
-
-## Authors and citation
-
-The `eds.endlines` pipeline was developed by AP-HP's Data Science team based on the work of Zweigenbaum et al[@zweigenbaum2016].
-
-\bibliography
+# Endlines {: #edsnlp.pipelines.core.endlines.factory.create_component }
+
+::: edsnlp.pipelines.core.endlines.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/core/index.md b/docs/pipelines/core/index.md
deleted file mode 100644
index ceae7f535..000000000
--- a/docs/pipelines/core/index.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# Core Pipelines
-
-This section deals with "core" functionalities offered by EDS-NLP:
-
-- Matching a terminology
-- Normalising a text
-- Detecting sentence boundaries
diff --git a/docs/pipelines/core/matcher.md b/docs/pipelines/core/matcher.md
index cb08afbb2..984541ec4 100644
--- a/docs/pipelines/core/matcher.md
+++ b/docs/pipelines/core/matcher.md
@@ -1,54 +1,8 @@
-# Matcher
-
-EDS-NLP simplifies the matching process by exposing a `eds.matcher` pipeline
-that can match on terms or regular expressions.
-
-## Usage
-
-Let us redefine the pipeline :
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-
-terms = dict(
- covid=["coronavirus", "covid19"], # (1)
- patient="patient", # (2)
-)
-
-regex = dict(
- covid=r"coronavirus|covid[-\s]?19|sars[-\s]cov[-\s]2", # (3)
-)
-
-nlp.add_pipe(
- "eds.matcher",
- config=dict(
- terms=terms,
- regex=regex,
- attr="LOWER",
- term_matcher="exact",
- term_matcher_config={},
- ),
-)
-```
-
-1. Every key in the `terms` dictionary is mapped to a concept.
-2. The `eds.matcher` pipeline expects a list of expressions, or a single expression.
-3. We can also define regular expression patterns.
-
-This snippet is complete, and should run as is.
-
-## Configuration
-
-The pipeline can be configured using the following parameters :
+# Matcher {: #edsnlp.pipelines.core.matcher.factory.create_component }
::: edsnlp.pipelines.core.matcher.factory.create_component
options:
- only_parameters: true
-
-Patterns, be they `terms` or `regex`, are defined as dictionaries where keys become the label of the extracted entities. Dictionary values are either a single expression or a list of expressions that match the concept (see [example](#usage)).
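-
-As a quick sketch of the resulting matches (reusing the pipeline defined [above](#usage); outputs are illustrative):
-
-```{ .python .no-check }
-doc = nlp("Le patient est atteint du covid-19.")
-
-# Each entity is labelled with the dictionary key it was matched under
-for ent in doc.ents:
-    print(ent.text, ent.label_)
-# Out: patient patient
-# Out: covid-19 covid
-```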
-
-## Authors and citation
-
-The `eds.matcher` pipeline was developed by AP-HP's Data Science team.
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/core/normalisation.md b/docs/pipelines/core/normalizer.md
similarity index 97%
rename from docs/pipelines/core/normalisation.md
rename to docs/pipelines/core/normalizer.md
index bd7c0ebfd..5c6b7223c 100644
--- a/docs/pipelines/core/normalisation.md
+++ b/docs/pipelines/core/normalizer.md
@@ -1,10 +1,8 @@
-# Normalisation
+# Normalisation {: #edsnlp.pipelines.core.normalizer.factory.create_component }
The normalisation scheme used by EDS-NLP adheres to the non-destructive doctrine. In other words,
-
-
-```python
+```{ .python .no-check }
nlp(text).text == text
```
@@ -39,7 +37,7 @@ The normalisation is handled by the single `eds.normalizer` pipeline. The follow
import spacy
from edsnlp.matchers.utils import get_text
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer")
# Notice the special character used for the apostrophe and the quotes
@@ -87,7 +85,7 @@ config = dict(
pollution=False,
)
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer", config=config)
text = "Pneumopathie à NBNbWbWbNbWbNBNbNbWbW `coronavirus'"
@@ -118,7 +116,7 @@ config = dict(
pollution=False,
)
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer", config=config)
text = "Pneumopathie à NBNbWbWbNbWbNBNbNbWbW `coronavirus'"
@@ -148,7 +146,7 @@ config = dict(
pollution=False,
)
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer", config=config)
text = "Pneumopathie à NBNbWbWbNbWbNBNbNbWbW `coronavirus'"
@@ -207,7 +205,7 @@ config = dict(
pollution=True,
)
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer", config=config)
text = "Pneumopathie à NBNbWbWbNbWbNBNbNbWbW `coronavirus'"
@@ -235,7 +233,7 @@ For instance, if we consider biology tables as pollution, we only need to instan
```python
import spacy
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe(
"eds.normalizer",
config=dict(
@@ -264,7 +262,7 @@ For instance, to consider text between "AAA" and "ZZZ" as pollution you might us
```python
import spacy
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe(
"eds.normalizer",
config=dict(
diff --git a/docs/pipelines/core/overview.md b/docs/pipelines/core/overview.md
new file mode 100644
index 000000000..e9d71307e
--- /dev/null
+++ b/docs/pipelines/core/overview.md
@@ -0,0 +1,22 @@
+# Core Components
+
+This section deals with "core" functionalities offered by EDS-NLP:
+
+- Generic matchers against regular expressions and lists of terms
+- Text cleaning
+- Sentence boundary detection
+
+## Available components
+
+
+<!-- --8<-- [start:components] -->
+| Component | Description |
+|----------------------------------------------------------------|-------------------------------------------------|
+| [`eds.normalizer`](/pipelines/core/normalizer) | Non-destructive input text normalisation |
+| [`eds.sentences`](/pipelines/core/sentences) | Better sentence boundary detection |
+| [`eds.matcher`](/pipelines/core/matcher) | A simple yet powerful entity extractor |
+| [`eds.terminology`](/pipelines/core/terminology) | A simple yet powerful terminology matcher |
+| [`eds.contextual_matcher`](/pipelines/core/contextual-matcher) | A conditional entity extractor |
+| [`eds.endlines`](/pipelines/core/endlines) | An unsupervised model to classify each end line |
+
+<!-- --8<-- [end:components] -->
diff --git a/docs/pipelines/core/sentences.md b/docs/pipelines/core/sentences.md
index 9bd34a166..0d1cfde54 100644
--- a/docs/pipelines/core/sentences.md
+++ b/docs/pipelines/core/sentences.md
@@ -1,71 +1,8 @@
-# Sentences
-
-The `eds.sentences` pipeline provides an alternative to spaCy's default `sentencizer`, aiming to overcome some of its limitations.
-
-Indeed, the `sentencizer` merely looks at period characters to detect the end of a sentence, a strategy that often fails in clinical note settings. Our `sentences` component also classifies end-of-lines as sentence boundaries if the subsequent token begins with an uppercase character, leading to slightly better performance.
-
-Moreover, the `eds.sentences` pipeline can use the output of the `eds.normalizer` pipeline, and more specifically the end-of-line classification. This is activated by default.
-
-## Usage
-
-=== "EDS-NLP"
-
-
-
- ```python
- import spacy
-
- nlp = spacy.blank("fr")
- nlp.add_pipe("eds.sentences")
-
- text = (
- "Le patient est admis le 23 août 2021 pour une douleur à l'estomac\n"
- "Il lui était arrivé la même chose il y a deux ans."
- )
-
- doc = nlp(text)
-
- for sentence in doc.sents:
-        print("<s>", sentence, "</s>")
-    # Out: <s> Le patient est admis le 23 août 2021 pour une douleur à l'estomac
-    # Out:  </s>
-    # Out: <s> Il lui était arrivé la même chose il y a deux ans. </s>
- ```
-
-=== "spaCy sentencizer"
-
-
-
- ```python
- import spacy
-
- nlp = spacy.blank("fr")
- nlp.add_pipe("sentencizer")
-
- text = (
- "Le patient est admis le 23 août 2021 pour une douleur à l'estomac\n"
- "Il lui était arrivé la même chose il y a deux ans."
- )
-
- doc = nlp(text)
-
- for sentence in doc.sents:
-        print("<s>", sentence, "</s>")
-    # Out: <s> Le patient est admis le 23 août 2021 pour une douleur à l'estomac
-    # Out: Il lui était arrivé la même chose il y a deux ans. </s>
- ```
-
-Notice how EDS-NLP's implementation is more robust to ill-defined sentence endings.
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
-
-| Parameter | Explanation | Default |
-| -------------- | ----------------------------------------------------------------------- | --------------------------------- |
-| `punct_chars` | Punctuation patterns | `None` (use pre-defined patterns) |
-| `use_endlines` | Whether to use endlines prediction (see [documentation](./endlines.md)) | `True` |
-
-## Authors and citation
-
-The `eds.sentences` pipeline was developed by AP-HP's Data Science team.
+# Sentences {: #edsnlp.pipelines.core.sentences.factory.create_component }
+
+::: edsnlp.pipelines.core.sentences.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/core/terminology.md b/docs/pipelines/core/terminology.md
index 9b14d1d30..ea13058cb 100644
--- a/docs/pipelines/core/terminology.md
+++ b/docs/pipelines/core/terminology.md
@@ -1,67 +1,8 @@
-# Terminology
-
-EDS-NLP simplifies the terminology matching process by exposing an `eds.terminology` pipeline
-that can match on terms or regular expressions.
-
-The terminology matcher is very similar to the [generic matcher](matcher.md), although the use case differs slightly.
-The generic matcher is designed to extract any entity, while the terminology matcher is specifically tailored
-towards high volume terminologies.
-
-There are some key differences:
-
-1. It gives every matched entity the same label, provided to the pipeline
-2. The keys provided in the `regex` and `terms` dictionaries are used as the `kb_id_` of the entity,
- which handles fine-grained labelling
-
-For instance, a terminology matcher could detect every drug mention under the top-level label `drug`,
-and link each individual mention to a given drug through its `kb_id_` attribute.
-
-## Usage
-
-Let us redefine the pipeline:
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-
-terms = dict(
- covid=["coronavirus", "covid19"], # (1)
- flu=["grippe saisonnière"], # (2)
-)
-
-regex = dict(
- covid=r"coronavirus|covid[-\s]?19|sars[-\s]cov[-\s]2", # (3)
-)
-
-nlp.add_pipe(
- "eds.terminology",
- config=dict(
- label="disease",
- terms=terms,
- regex=regex,
- attr="LOWER",
- ),
-)
-```
-
-1. Every key in the `terms` dictionary is mapped to a concept.
-2. The `eds.terminology` pipeline expects a list of expressions, or a single expression.
-3. We can also define regular expression patterns.
-
-This snippet is complete, and should run as is.
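-
-For instance (a sketch of the fine-grained labelling, reusing the pipeline defined above; outputs are illustrative):
-
-```{ .python .no-check }
-doc = nlp("Le patient est atteint du coronavirus et de la grippe saisonnière.")
-
-# Every match shares the `disease` label; the dictionary key goes to `kb_id_`
-for ent in doc.ents:
-    print(ent.text, ent.label_, ent.kb_id_)
-# Out: coronavirus disease covid
-# Out: grippe saisonnière disease flu
-```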
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
+# Terminology {: #edsnlp.pipelines.core.terminology.factory.create_component }
::: edsnlp.pipelines.core.terminology.factory.create_component
options:
- only_parameters: true
-
-Patterns, be they `terms` or `regex`, are defined as dictionaries where keys become the `kb_id_` of the extracted entities.
-Dictionary values are either a single expression or a list of expressions that match the concept (see [example](#usage)).
-
-## Authors and citation
-
-The `eds.terminology` pipeline was developed by AP-HP's Data Science team.
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/index.md b/docs/pipelines/index.md
index 28269efcb..c15d22e6d 100644
--- a/docs/pipelines/index.md
+++ b/docs/pipelines/index.md
@@ -1,6 +1,6 @@
# Pipelines overview
-EDS-NLP provides easy-to-use spaCy components.
+EDS-NLP provides easy-to-use pipeline components.
=== "Core"
@@ -15,58 +15,19 @@ EDS-NLP provides easy-to-use spaCy components.
=== "Qualifiers"
- | Pipeline | Description |
- | --------------------- | ------------------------------------ |
- | `eds.negation` | Rule-based negation detection |
- | `eds.family` | Rule-based family context detection |
- | `eds.hypothesis` | Rule-based speculation detection |
- | `eds.reported_speech` | Rule-based reported speech detection |
- | `eds.history` | Rule-based medical history detection |
+ See the [Qualifier overview](/pipelines/qualifiers/overview/) for more information.
+
+ --8<-- "docs/pipelines/qualifiers/overview.md:components"
=== "Miscellaneous"
- | Pipeline | Description |
- | ------------------------ | ------------------------------------------- |
- | `eds.dates` | Date extraction and normalisation |
- | `eds.consultation_dates` | Identify consultation dates |
- | `eds.measurements` | Measure extraction and normalisation |
- | `eds.sections` | Section detection |
- | `eds.reason` | Rule-based hospitalisation reason detection |
- | `eds.tables` | Tables detection |
+ --8<-- "docs/pipelines/misc/overview.md:components"
=== "NER"
- | Pipeline | Description |
- | --------------------------------- | ------------------------------------- |
- | `eds.covid` | A COVID mentions detector |
- | `eds.charlson` | A Charlson score extractor |
- | `eds.elstonellis` | An Elston & Ellis code extractor |
- | `eds.emergency.priority` | A priority score extractor |
- | `eds.emergency.ccmu` | A CCMU score extractor |
- | `eds.emergency.gemsa` | A GEMSA score extractor |
- | `eds.sofa` | A SOFA score extractor |
- | `eds.TNM` | A TNM score extractor |
-    | `eds.adicap`                     | An ADICAP code extractor              |
- | `eds.drugs` | A drug mentions extractor |
- | `eds.cim10` | A CIM10 terminology matcher |
-    | `eds.umls`                       | A UMLS terminology matcher            |
- | `eds.CKD` | CKD extractor |
- | `eds.COPD` | COPD extractor |
- | `eds.cerebrovascular_accident` | Cerebrovascular accident extractor |
- | `eds.congestive_heart_failure` | Congestive heart failure extractor |
- | `eds.connective_tissue_disease` | Connective tissue disease extractor |
- | `eds.dementia` | Dementia extractor |
- | `eds.diabetes` | Diabetes extractor |
- | `eds.hemiplegia` | Hemiplegia extractor |
- | `eds.leukemia` | Leukemia extractor |
- | `eds.liver_disease` | Liver disease extractor |
- | `eds.lymphoma` | Lymphoma extractor |
- | `eds.myocardial_infarction` | Myocardial infarction extractor |
- | `eds.peptic_ulcer_disease` | Peptic ulcer disease extractor |
- | `eds.peripheral_vascular_disease` | Peripheral vascular disease extractor |
- | `eds.solid_tumor` | Solid tumor extractor |
- | `eds.alcohol` | Alcohol consumption extractor |
- | `eds.tobacco` | Tobacco consumption extractor |
+ See the [NER overview](/pipelines/ner/overview/) for more information.
+
+ --8<-- "docs/pipelines/ner/overview.md:components"
=== "Trainable"
@@ -75,11 +36,13 @@ EDS-NLP provides easy-to-use spaCy components.
| `eds.nested-ner` | A trainable component for nested (and classic) NER |
| `eds.span-qualifier` | A trainable component for multi-class multi-label span qualification |
-You can add them to your spaCy pipeline by simply calling `add_pipe`, for instance:
-
-
+You can add them to your pipeline by simply calling `add_pipe`, for instance:
```python
-# ↑ Omitted code that defines the nlp object ↑
+import spacy
+
+nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer")
+nlp.add_pipe("eds.sentences")
+nlp.add_pipe("eds.tnm")
```
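+
+As a quick sanity check (a sketch; the output is illustrative), you can then list the registered components:
+
+```{ .python .no-check }
+nlp.pipe_names
+# Out: ['eds.normalizer', 'eds.sentences', 'eds.tnm']
+```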
diff --git a/docs/pipelines/misc/consultation-dates.md b/docs/pipelines/misc/consultation-dates.md
index ac4a87f0b..b91b0af76 100644
--- a/docs/pipelines/misc/consultation-dates.md
+++ b/docs/pipelines/misc/consultation-dates.md
@@ -1,80 +1,8 @@
-# Consultation Dates
-
-This pipeline consists of two main parts:
-
-- A **matcher** which finds mentions of _consultation events_ (more details below)
-- A **date parser** (see the corresponding pipeline) that links a date to those events
-
-## Usage
-
-!!! note
-
- It is designed to work **ONLY on consultation notes** (`CR-CONS`), so please filter accordingly before proceeding.
-
-```python
-import spacy
-
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- lowercase=True,
- accents=True,
- quotes=True,
- pollution=False,
- ),
-)
-nlp.add_pipe("eds.consultation_dates")
-
-text = "XXX \n" "Objet : Compte-Rendu de Consultation du 03/10/2018. \n" "XXX "
-
-doc = nlp(text)
-
-doc.spans["consultation_dates"]
-# Out: [Consultation du 03/10/2018]
-
-doc.spans["consultation_dates"][0]._.consultation_date.to_datetime()
-# Out: DateTime(2018, 10, 3, 0, 0, 0, tzinfo=Timezone('Europe/Paris'))
-```
-
-## Consultation events
-
-Three main families of terms are available by default to extract those events.
-
-### The `consultation_mention` terms
-
-This list contains terms directly referring to consultations, such as "_Consultation du..._" or "_Compte rendu du..._".
-This list is the only one activated by default since it is fairly precise and not error-prone.
-
-### The `town_mention` terms
-
-This list contains the town of each AP-HP hospital. Its goal is to fetch dates mentioned as "_Paris, le 13 décembre 2015_". It has a high recall but poor precision, since those dates are often the date the letter was written rather than the consultation date.
-
-### The `document_date_mention` terms
-
-This list contains expressions mentioning the date of creation/edition of a document, such as "_Date du rapport: 13/12/2015_" or "_Signé le 13/12/2015_". As with `town_mention`, it has a high recall but is prone to errors, since the document date and the consultation date aren't necessarily the same.
-
-!!! note
-
-    By default, only the `consultation_mention` terms are used.
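-
-To broaden the search, the other term families can be enabled through the configuration (a sketch; the parameters are described in the table below):
-
-```{ .python .no-check }
-nlp.add_pipe(
-    "eds.consultation_dates",
-    config=dict(
-        consultation_mention=True,
-        town_mention=True,  # beware: lower precision
-        document_date_mention=True,
-    ),
-)
-```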
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
-
-| Parameter | Explanation | Default |
-| ----------------------- | ---------------------------------------------------------- | --------------------------------- |
-| `consultation_mention` | Whether to use consultation patterns, or list of patterns | `True` (use pre-defined patterns) |
-| `town_mention` | Whether to use town patterns, or list of patterns | `False` |
-| `document_date_mention` | Whether to use document date patterns, or list of patterns | `False` |
-| `attr` | spaCy attribute to match on, eg `NORM` or `TEXT` | `"NORM"` |
-
-## Declared extensions
-
-The `eds.consultation_dates` pipeline declares one [spaCy extension](https://spacy.io/usage/processing-pipelines#custom-components-attributes) on the `Span` object: the `consultation_date` attribute, which is a Python `datetime` object.
-
-## Authors and citation
-
-The `eds.consultation_dates` pipeline was developed by AP-HP's Data Science team.
+# Consultation dates {: #edsnlp.pipelines.misc.consultation_dates.factory.create_component }
+
+::: edsnlp.pipelines.misc.consultation_dates.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/misc/dates.md b/docs/pipelines/misc/dates.md
index 09555a1e0..f5febd440 100644
--- a/docs/pipelines/misc/dates.md
+++ b/docs/pipelines/misc/dates.md
@@ -1,83 +1,8 @@
-# Dates
-
-The `eds.dates` pipeline's role is to detect and normalise dates within a medical document.
-We use simple regular expressions to extract date mentions.
-
-## Scope
-
-The `eds.dates` pipeline finds absolute (eg `23/08/2021`) and relative (eg `hier`, `la semaine dernière`) dates alike. It also handles mentions of duration.
-
-| Type | Example |
-| ---------- | ----------------------------- |
-| `absolute` | `3 mai`, `03/05/2020` |
-| `relative` | `hier`, `la semaine dernière` |
-| `duration` | `pendant quatre jours` |
-
-See the [tutorial](../../tutorials/detecting-dates.md) for a presentation of a full pipeline featuring the `eds.dates` component.
-
-## Usage
-
-```python
-import spacy
-
-import pendulum
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.dates")
-
-text = (
- "Le patient est admis le 23 août 2021 pour une douleur à l'estomac. "
- "Il lui était arrivé la même chose il y a un an pendant une semaine. "
- "Il a été diagnostiqué en mai 1995."
-)
-
-doc = nlp(text)
-
-dates = doc.spans["dates"]
-dates
-# Out: [23 août 2021, il y a un an, pendant une semaine, mai 1995]
-
-dates[0]._.date.to_datetime()
-# Out: 2021-08-23T00:00:00+02:00
-
-dates[1]._.date.to_datetime()
-# Out: -1 year
-
-note_datetime = pendulum.datetime(2021, 8, 27, tz="Europe/Paris")
-
-dates[1]._.date.to_datetime(note_datetime=note_datetime)
-# Out: DateTime(2020, 8, 27, 0, 0, 0, tzinfo=Timezone('Europe/Paris'))
-
-date_3_output = dates[3]._.date.to_datetime(
- note_datetime=note_datetime,
- infer_from_context=True,
- tz="Europe/Paris",
- default_day=15,
-)
-date_3_output
-# Out: DateTime(1995, 5, 15, 0, 0, 0, tzinfo=Timezone('Europe/Paris'))
-```
-
-## Declared extensions
-
-The `eds.dates` pipeline declares one [spaCy extension](https://spacy.io/usage/processing-pipelines#custom-components-attributes) on the `Span` object: the `date` attribute contains a parsed version of the date.
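-
-For instance, the parsed fields can be read directly (a sketch, reusing the `dates` spans from the usage example; the attribute names are assumed from the parsed date model):
-
-```{ .python .no-check }
-date = dates[0]._.date  # parsed version of "23 août 2021"
-
-# `year`/`month`/`day` are assumed field names of the parsed date object
-date.year, date.month, date.day
-# Out: (2021, 8, 23)
-```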
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
-
-| Parameter | Explanation | Default |
-|------------------|--------------------------------------------------|-----------------------------------|
-| `absolute` | Absolute date patterns, eg `le 5 août 2020` | `None` (use pre-defined patterns) |
-| `relative`       | Relative date patterns, eg `hier`                | `None` (use pre-defined patterns) |
-| `durations`      | Duration patterns, eg `pendant trois mois`       | `None` (use pre-defined patterns) |
-| `false_positive` | Some false positive patterns to exclude | `None` (use pre-defined patterns) |
-| `detect_periods` | Whether to look for periods | `False` |
-| `detect_time` | Whether to look for time around dates | `True` |
-| `on_ents_only` | Whether to look for dates around entities only | `False` |
-| `as_ents` | Whether to save detected dates as entities | `False` |
-| `attr` | spaCy attribute to match on, eg `NORM` or `TEXT` | `"NORM"` |
-
-## Authors and citation
-
-The `eds.dates` pipeline was developed by AP-HP's Data Science team.
+# Dates {: #edsnlp.pipelines.misc.dates.factory.create_component }
+
+::: edsnlp.pipelines.misc.dates.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/misc/index.md b/docs/pipelines/misc/index.md
deleted file mode 100644
index 74caf0cfd..000000000
--- a/docs/pipelines/misc/index.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Miscellaneous
-
-This section regroups pipelines that extract information that can be used by other components, but have little medical value in themselves.
-
-For instance, the date detection and normalisation pipeline falls in this category.
diff --git a/docs/pipelines/misc/measurements.md b/docs/pipelines/misc/measurements.md
index 23e462409..a334b4078 100644
--- a/docs/pipelines/misc/measurements.md
+++ b/docs/pipelines/misc/measurements.md
@@ -1,151 +1,8 @@
-# Measurements
-
-The `eds.measurements` pipeline's role is to detect and normalise numerical measurements within a medical document.
-We use simple regular expressions to extract and normalise measurements, and use `Measurement` classes to store them.
-
-!!! warning
-
-    The `eds.measurements` pipeline is still in active development and has not been rigorously validated.
-    If you come across a measurement expression that goes undetected, please file an issue!
-
-## Scope
-
-The `eds.measurements` pipeline can extract simple (eg `3cm`) measurements.
-It can detect elliptic enumerations (eg `32, 33 et 34kg`) of measurements of the same type and split the measurements accordingly.
-
-The normalized value can then be accessed via the `span._.value` attribute and converted on the fly to a desired unit.
-
-The current pipeline annotates the following measurements out of the box:
-
-| Measurement name | Example |
-|------------------|------------------------|
-| `eds.size` | `1m50`, `1.50m` |
-| `eds.weight` | `12kg`, `1kg300` |
-| `eds.bmi` | `BMI: 24`, `24 kg.m-2` |
-| `eds.volume` | `2 cac`, `8ml` |
-
-## Usage
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe(
- "eds.measurements",
- config=dict(
- measurements=["eds.size", "eds.weight", "eds.bmi"],
- extract_ranges=True,
- ),
-)
-
-text = """
-Le patient est admis hier, fait 1m78 pour 76kg.
-Les deux nodules bénins sont larges de 1,2 et 2.4mm.
-BMI: 24.
-
-Le nodule fait entre 1 et 1.5 cm
-"""
-
-doc = nlp(text)
-
-measurements = doc.spans["measurements"]
-
-measurements
-# Out: [1m78, 76kg, 1,2, 2.4mm, 24, entre 1 et 1.5 cm]
-
-measurements[0]
-# Out: 1m78
-
-str(measurements[0]._.value)
-# Out: '1.78 m'
-
-measurements[0]._.value.cm
-# Out: 178.0
-
-measurements[2]
-# Out: 1,2
-
-str(measurements[2]._.value)
-# Out: '1.2 mm'
-
-measurements[2]._.value.mm
-# Out: 1.2
-
-measurements[4]
-# Out: 24
-
-str(measurements[4]._.value)
-# Out: '24 kg_per_m2'
-
-measurements[4]._.value.kg_per_m2
-# Out: 24
-
-str(measurements[5]._.value)
-# Out: '1-1.5 cm'
-```
-
-To extract all sizes in centimeters (averaging range measurements), you can use the following snippet:
-
-```python
-sizes = [
- sum(item.cm for item in m._.value) / len(m._.value)
- for m in doc.spans["measurements"]
- if m.label_ == "eds.size"
-]
-sizes
-# Out: [178.0, 0.12, 0.24, 1.25]
-```
-
-## Custom measurement
-
-You can declare custom measurements by changing the patterns:
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe(
- "eds.measurements",
- config=dict(
- measurements={
- # this name will be used to define the labels of the matched entities
- "my_custom_surface_measurement": {
-            # This measurement unit is homogeneous to square meters
- "unit": "m2",
- # To handle cases like "surface: 1.8" (implied m2), we can use
- # unitless patterns
- "unitless_patterns": [
- {
- "terms": ["surface", "aire"],
- "ranges": [
- {
- "unit": "m2",
- "min": 0,
- "max": 9,
- }
- ],
- }
- ],
- },
- }
- ),
-)
-```
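-
-A hypothetical check of the custom pattern above (the text and behaviour are illustrative only, not guaranteed output):
-
-```{ .python .no-check }
-doc = nlp("Lésion d'une surface de 1.8.")
-
-# "1.8" is unitless but preceded by the "surface" trigger, so it is read as 1.8 m2
-span = doc.spans["measurements"][0]
-print(span, span.label_, span._.value)
-```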
-
-## Declared extensions
-
-The `eds.measurements` pipeline declares a single [spaCy extension](https://spacy.io/usage/processing-pipelines#custom-components-attributes) on the `Span` object,
-the `value` attribute that is a `Measurement` instance.
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
+# Measurements {: #edsnlp.pipelines.misc.measurements.factory.create_component }
::: edsnlp.pipelines.misc.measurements.factory.create_component
options:
- only_parameters: true
-
-## Authors and citation
-
-The `eds.measurements` pipeline was developed by AP-HP's Data Science team.
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/misc/overview.md b/docs/pipelines/misc/overview.md
new file mode 100644
index 000000000..9af00378c
--- /dev/null
+++ b/docs/pipelines/misc/overview.md
@@ -0,0 +1,20 @@
+# Miscellaneous
+
+This section regroups components that extract information that can be used by other components, but have little medical value in themselves.
+
+For instance, the date detection and normalisation pipeline falls in this category.
+
+## Available components
+
+
+<!-- --8<-- [start:components] -->
+| Component | Description |
+|----------------------------------------------------------------|---------------------------------------------|
+| [`eds.dates`](/pipelines/misc/dates) | Date extraction and normalisation |
+| [`eds.consultation_dates`](/pipelines/misc/consultation-dates) | Identify consultation dates |
+| [`eds.measurements`](/pipelines/misc/measurements) | Measure extraction and normalisation |
+| [`eds.sections`](/pipelines/misc/sections) | Section detection |
+| [`eds.reason`](/pipelines/misc/reason) | Rule-based hospitalisation reason detection |
+| [`eds.tables`](/pipelines/misc/tables) | Tables detection |
+
+<!-- --8<-- [end:components] -->
diff --git a/docs/pipelines/misc/reason.md b/docs/pipelines/misc/reason.md
index 343673eaf..3d3d0c90e 100644
--- a/docs/pipelines/misc/reason.md
+++ b/docs/pipelines/misc/reason.md
@@ -1,82 +1,8 @@
-# Reason
-
-The `eds.reason` pipeline uses a rule-based algorithm to detect spans that relate to the reason of the hospitalisation. It was designed at AP-HP's EDS.
-
-## Usage
-
-The following snippet matches a simple terminology, and looks for spans of hospitalisation reasons. It is complete and can be run _as is_.
-
-```python
-import spacy
-
-text = """COMPTE RENDU D'HOSPITALISATION du 11/07/2018 au 12/07/2018
-MOTIF D'HOSPITALISATION
-Monsieur Dupont Jean Michel, de sexe masculin, âgée de 39 ans, née le 23/11/1978, a été
-hospitalisé du 11/08/2019 au 17/08/2019 pour attaque d'asthme.
-
-ANTÉCÉDENTS
-Antécédents médicaux :
-Premier épisode d'asthme en mai 2018."""
-
-nlp = spacy.blank("fr")
-
-# Extraction of entities
-nlp.add_pipe(
- "eds.matcher",
- config=dict(
- terms=dict(
- respiratoire=[
- "asthmatique",
- "asthme",
- "toux",
- ]
- )
- ),
-)
-
-
-nlp.add_pipe("eds.normalizer")
-nlp.add_pipe("eds.reason", config=dict(use_sections=True))
-doc = nlp(text)
-
-reason = doc.spans["reasons"][0]
-reason
-# Out: hospitalisé du 11/08/2019 au 17/08/2019 pour attaque d'asthme.
-
-reason._.is_reason
-# Out: True
-
-entities = reason._.ents_reason
-entities
-# Out: [asthme]
-
-entities[0].label_
-# Out: 'respiratoire'
-
-ent = entities[0]
-ent._.is_reason
-# Out: True
-```
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
-
-| Parameter | Explanation | Default |
-| ----------------- | ------------------------------------------------ | --------------------------------- |
-| `reasons` | Reasons patterns | `None` (use pre-defined patterns) |
-| `attr` | spaCy attribute to match on, eg `NORM` or `TEXT` | `"NORM"` |
-| `use_sections` | Whether to use sections | `False` |
-| `ignore_excluded` | Whether to ignore excluded tokens | `False` |
-
-## Declared extensions
-
-The `eds.reason` pipeline adds the key `reasons` to `doc.spans` and declares one [spaCy extension](https://spacy.io/usage/processing-pipelines#custom-components-attributes) on `Span` objects, called `ents_reason`.
-
-The `ents_reason` extension is a list of named entities that overlap the `Span`, typically entities found by previous pipelines like `matcher`.
-
-It also declares the boolean extension `is_reason`, which is set to `True` for reason spans as well as for the entities that overlap them.
-
-## Authors and citation
-
-The `eds.reason` pipeline was developed by AP-HP's Data Science team.
+# Reasons {: #edsnlp.pipelines.misc.reason.factory.create_component }
+
+::: edsnlp.pipelines.misc.reason.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/misc/sections.md b/docs/pipelines/misc/sections.md
index b53916af2..8e9598a18 100644
--- a/docs/pipelines/misc/sections.md
+++ b/docs/pipelines/misc/sections.md
@@ -1,109 +1,8 @@
-# Sections
-
-Detected sections are:
-
-- `allergies`
-- `antécédents`
-- `antécédents familiaux`
-- `traitements entrée`
-- `conclusion`
-- `conclusion entrée`
-- `habitus`
-- `correspondants`
-- `diagnostic`
-- `données biométriques entrée`
-- `examens`
-- `examens complémentaires`
-- `facteurs de risques`
-- `histoire de la maladie`
-- `actes`
-- `motif`
-- `prescriptions`
-- `traitements sortie`
-- `evolution`
-- `modalites sortie`
-- `vaccinations`
-- `introduction`
-
-The pipeline extracts section titles. A "section" is then defined as the span of text between two titles.
-
-Remarks:
-
-- The `introduction` section corresponds to the span of text between the header "COMPTE RENDU D'HOSPITALISATION" (usually denoting the beginning of the document) and the title of the following detected section
-- This pipeline works well for hospitalisation summaries (CRH), but not necessarily for all types of documents (in particular for emergency or scan summaries, CR-IMAGERIE)
-
-!!! warning "Use at your own risk"
-
-    Should you rely on `eds.sections` for critical downstream tasks, validate the pipeline first and check that the component behaves as expected on your documents.
-    For instance, the `eds.history` pipeline can use sections to make its predictions, but that possibility is deactivated by default.
-
-## Usage
-
-The following snippet detects section titles. It is complete and can be run _as is_.
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.normalizer")
-nlp.add_pipe("eds.sections")
-
-text = "CRU du 10/09/2021\n" "Motif :\n" "Patient admis pour suspicion de COVID"
-
-doc = nlp(text)
-
-doc.spans["section_titles"]
-# Out: [Motif]
-```
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
-
-| Parameter | Explanation | Default |
-| ----------------- | ------------------------------------------------ | --------------------------------- |
-| `sections` | Sections patterns | `None` (use pre-defined patterns) |
-| `add_patterns`    | Whether to add endlines patterns                 | `True`                            |
-| `attr` | spaCy attribute to match on, eg `NORM` or `TEXT` | `"NORM"` |
-| `ignore_excluded` | Whether to ignore excluded tokens | `True` |
-
-## Declared extensions
-
-The `eds.sections` pipeline adds two fields to the `doc.spans` attribute:
-
-1. The `section_titles` key contains the list of all section titles extracted using the list declared in the `terms.py` module.
-2. The `sections` key contains a list of sections, ie spans of text between two section titles (or the last title and the end of the document).
-
-If the document already has entities when this pipeline is called, a `section` attribute is added to each entity.
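-
-A minimal sketch of these extensions (reusing the `doc` from the usage example above; the label convention is assumed):
-
-```{ .python .no-check }
-for section in doc.spans["sections"]:
-    # Each section span carries the normalised section name as its label
-    print(section.label_, repr(section.text[:30]))
-```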
-
-## Authors and citation
-
-The `eds.sections` pipeline was developed by AP-HP's Data Science team.
+# Sections {: #edsnlp.pipelines.misc.sections.factory.create_component }
+
+::: edsnlp.pipelines.misc.sections.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/misc/tables.md b/docs/pipelines/misc/tables.md
index b4d6966b0..c24c86d55 100644
--- a/docs/pipelines/misc/tables.md
+++ b/docs/pipelines/misc/tables.md
@@ -1,96 +1,8 @@
-# Tables
-
-The `eds.tables` pipeline's role is to detect tables present in a medical document.
-We use simple regular expressions to detect text formatted as tables.
-
-## Usage
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.normalizer")
-nlp.add_pipe("eds.tables")
-
-text = """
-SERVICE
-MEDECINE INTENSIVE –
-REANIMATION
-Réanimation / Surveillance Continue
-Médicale
-
-COMPTE RENDU D'HOSPITALISATION du 05/06/2020 au 10/06/2020
-Madame DUPONT Marie, née le 16/05/1900, âgée de 20 ans, a été hospitalisée en réanimation du
-05/06/1920 au 10/06/1920 pour intoxication médicamenteuse volontaire.
-
-
-Examens complémentaires
-Hématologie
-Numération
-Leucocytes ¦x10*9/L ¦4.97 ¦4.09-11
-Hématies ¦x10*12/L¦4.68 ¦4.53-5.79
-Hémoglobine ¦g/dL ¦14.8 ¦13.4-16.7
-Hématocrite ¦% ¦44.2 ¦39.2-48.6
-VGM ¦fL ¦94.4 + ¦79.6-94
-TCMH ¦pg ¦31.6 ¦27.3-32.8
-CCMH ¦g/dL ¦33.5 ¦32.4-36.3
-Plaquettes ¦x10*9/L ¦191 ¦172-398
-VMP ¦fL ¦11.5 + ¦7.4-10.8
-
-Sur le plan neurologique : Devant la persistance d'une confusion à distance de l'intoxication au
-...
-
-2/2Pat : |F | | |Intitulé RCP
-
-"""
-
-doc = nlp(text)
-
-# A table span
-table = doc.spans["tables"][0]
-# Leucocytes ¦x10*9/L ¦4.97 ¦4.09-11
-# Hématies ¦x10*12/L¦4.68 ¦4.53-5.79
-# Hémoglobine ¦g/dL ¦14.8 ¦13.4-16.7
-# Hématocrite ¦% ¦44.2 ¦39.2-48.6
-# VGM ¦fL ¦94.4 + ¦79.6-94
-# TCMH ¦pg ¦31.6 ¦27.3-32.8
-# CCMH ¦g/dL ¦33.5 ¦32.4-36.3
-# Plaquettes ¦x10*9/L ¦191 ¦172-398
-# VMP ¦fL ¦11.5 + ¦7.4-10.8
-
-# Convert span to Pandas table
-df = table._.to_pd_table()
-type(df)
-# Out: pandas.core.frame.DataFrame
-```
-
-The resulting pandas DataFrame:
-
-| | 0 | 1 | 2 | 3 |
-| ---: | :---------- | :------- | :----- | :-------- |
-| 0 | Leucocytes | x10*9/L | 4.97 | 4.09-11 |
-| 1 | Hématies | x10*12/L | 4.68 | 4.53-5.79 |
-| 2 | Hémoglobine | g/dL | 14.8 | 13.4-16.7 |
-| 3 | Hématocrite | % | 44.2 | 39.2-48.6 |
-| 4 | VGM | fL | 94.4 + | 79.6-94 |
-| 5 | TCMH | pg | 31.6 | 27.3-32.8 |
-| 6 | CCMH | g/dL | 33.5 | 32.4-36.3 |
-| 7 | Plaquettes | x10*9/L | 191 | 172-398 |
-| 8 | VMP | fL | 11.5 + | 7.4-10.8 |
-
-## Declared extensions
-
-The `eds.tables` pipeline declares one [spaCy extension](https://spacy.io/usage/processing-pipelines#custom-components-attributes) on the `Span` object: the `to_pd_table()` method returns a parsed pandas version of the table.
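-
-For instance, to parse every detected table at once (a sketch, reusing the `doc` from the usage example above):
-
-```{ .python .no-check }
-tables = [table._.to_pd_table() for table in doc.spans["tables"]]
-
-# One pandas DataFrame per detected table
-len(tables)
-```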
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
-
-| Parameter | Explanation | Default |
-| ----------------- | ------------------------------------------------ | ---------------------- |
-| `tables_pattern` | Pattern to identify table spans | `rf"(\b.*{sep}.*\n)+"` |
-| `sep_pattern` | Pattern to identify column separation | `r"¦"` |
-| `ignore_excluded` | Ignore excluded tokens | `True` |
-| `attr` | spaCy attribute to match on, eg `NORM` or `TEXT` | `"TEXT"` |
-
-## Authors and citation
-
-The `eds.tables` pipeline was developed by AP-HP's Data Science team.
+# Tables {: #edsnlp.pipelines.misc.tables.factory.create_component }
+
+::: edsnlp.pipelines.misc.tables.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/adicap.md b/docs/pipelines/ner/adicap.md
index afc2bae26..730df6127 100644
--- a/docs/pipelines/ner/adicap.md
+++ b/docs/pipelines/ner/adicap.md
@@ -1,106 +1,8 @@
-# ADICAP
-
-The `eds.adicap` pipeline component matches ADICAP codes.
-
-!!! warning "Document type"
-
-    It was developed to work on anapathology reports.
-
-    We also recommend using the `eds` language (`spacy.blank("eds")`).
-
-The compulsory characters of the ADICAP code are identified and decoded.
-These characters represent the following attributes:
-
-
-
-| Field [en] | Field [fr] | Attribute |
-|-----------------------|----------------------------------|-----------------------|
-| Sampling mode | Mode de prelevement | sampling_mode |
-| Technic | Type de technique | technic |
-| Organ and regions | Appareils, organes et régions | organ |
-| Pathology | Pathologie générale | pathology |
-| Pathology type | Type de la pathologie | pathology_type |
-| Behaviour type | Type de comportement | behaviour_type |
-
-
-The pathology field takes 4 different values corresponding to the 4 possible interpretations of the ADICAP code, which are: "PATHOLOGIE GÉNÉRALE NON TUMORALE", "PATHOLOGIE TUMORALE", "PATHOLOGIE PARTICULIERE DES ORGANES" and "CYTOPATHOLOGIE".
-
-Depending on the pathology value, the meaning of the behaviour type changes: when the pathology is tumoral, it describes the malignancy of the tumor.
-
-For further details about the ADICAP code, follow this [link](https://smt.esante.gouv.fr/wp-json/ans/terminologies/document?terminologyId=terminologie-adicap&fileName=cgts_sem_adicap_fiche-detaillee.pdf).
-
-## Usage
-
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe("eds.adicap")
-
-text = """
-COMPTE RENDU D’EXAMEN
-
-Antériorité(s) : NEANT
-
-
-Renseignements cliniques :
-Contexte d'exploration d'un carcinome canalaire infiltrant du quadrant supéro-externe du sein droit. La
-lésion biopsiée ce jour est située à 5,5 cm de la lésion du quadrant supéro-externe, à l'union des
-quadrants inférieurs.
-
-
-Macrobiopsie 10G sur une zone de prise de contraste focale à l'union des quadrants inférieurs du
-sein droit, mesurant 4 mm, classée ACR4
-
-14 fragments ont été communiqués fixés en formol (lame n° 1a et lame n° 1b) . Il n'y a pas eu
-d'échantillon congelé. Ces fragments ont été inclus en paraffine en totalité et coupés sur plusieurs
-niveaux.
-Histologiquement, il s'agit d'un parenchyme mammaire fibroadipeux parfois légèrement dystrophique
-avec quelques petits kystes. Il n'y a pas d'hyperplasie épithéliale, pas d'atypie, pas de prolifération
-tumorale. On note quelques suffusions hémorragiques focales.
-
-Conclusion :
-Légers remaniements dystrophiques à l'union des quadrants inférieurs du sein droit.
-Absence d'atypies ou de prolifération tumorale.
-
-Codification : BHGS0040
-"""
-
-doc = nlp(text)
-
-doc.ents
-# Out: (BHGS0040,)
-
-ent = doc.ents[0]
-
-ent.label_
-# Out: adicap
-
-ent._.adicap.dict()
-# Out: {'code': 'BHGS0040',
-# 'sampling_mode': 'BIOPSIE CHIRURGICALE',
-# 'technic': 'HISTOLOGIE ET CYTOLOGIE PAR INCLUSION',
-# 'organ': "SEIN (ÉGALEMENT UTILISÉ CHEZ L'HOMME)",
-# 'pathology': 'PATHOLOGIE GÉNÉRALE NON TUMORALE',
-# 'pathology_type': 'ETAT SUBNORMAL - LESION MINEURE',
-# 'behaviour_type': 'CARACTERES GENERAUX'}
-```
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
+# Adicap {: #edsnlp.pipelines.ner.adicap.factory.create_component }
::: edsnlp.pipelines.ner.adicap.factory.create_component
options:
- only_parameters: true
-
-
-## Authors and citation
-
-The `eds.adicap` pipeline was developed by AP-HP's Data Science team.
-The codes were downloaded from the website of 'Agence du numérique en santé' [@terminologie-adicap] ("Thésaurus de la codification ADICAP - Index raisonné des lésions")
-
-\bibliography
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/behaviors/alcohol.md b/docs/pipelines/ner/behaviors/alcohol.md
index 622b8e764..a3e0792f4 100644
--- a/docs/pipelines/ner/behaviors/alcohol.md
+++ b/docs/pipelines/ner/behaviors/alcohol.md
@@ -1,176 +1,7 @@
-# Alcohol consumption
+# Alcohol consumption {: #edsnlp.pipelines.ner.behaviors.alcohol.factory.create_component }
-The `eds.alcohol` pipeline component extracts mentions of alcohol consumption. It won't match occasional consumption or acute intoxication.
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/behaviors/alcohol/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to either
- - `"PRESENT"`
-    - `"ABSTINENCE"` if the patient stopped their consumption
- - `"ABSENT"` if the patient has no alcohol dependence
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe("eds.alcohol")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Patient alcoolique."
- doc = nlp(text)
- spans = doc.spans["alcohol"]
-
- spans
- # Out: [alcoolique]
- ```
-
-
-
-=== "2"
- ```python
- text = "OH chronique."
- doc = nlp(text)
- spans = doc.spans["alcohol"]
-
- spans
- # Out: [OH]
- ```
-
-
-
-=== "3"
- ```python
- text = "Prise d'alcool occasionnelle"
- doc = nlp(text)
- spans = doc.spans["alcohol"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "4"
- ```python
- text = "Application d'un pansement alcoolisé"
- doc = nlp(text)
- spans = doc.spans["alcohol"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "5"
- ```python
- text = "Alcoolisme sevré"
- doc = nlp(text)
- spans = doc.spans["alcohol"]
-
- spans
- # Out: [Alcoolisme sevré]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: ABSTINENCE
-
- span._.assigned
- # Out: {'stopped': [sevré]}
- ```
-
-
-
-=== "6"
- ```python
- text = "Alcoolisme non sevré"
- doc = nlp(text)
- spans = doc.spans["alcohol"]
-
- spans
- # Out: [Alcoolisme]
- ```
-
-
-
-=== "7"
- ```python
- text = "Alcool: 0"
- doc = nlp(text)
- spans = doc.spans["alcohol"]
-
- spans
- # Out: [Alcool: 0]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: ABSENT
-
- span._.assigned
- # Out: {'zero_after': [0]}
- ```
-
-
-
-=== "8"
- ```python
- text = "Le patient est en cours de sevrage éthylotabagique"
- doc = nlp(text)
- spans = doc.spans["alcohol"]
-
- spans
- # Out: [sevrage éthylotabagique]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: ABSTINENCE
-
- span._.assigned
- # Out: {'stopped': [sevrage]}
- ```
-
-## Authors and citation
-
-The `eds.alcohol` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
+::: edsnlp.pipelines.ner.behaviors.alcohol.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
diff --git a/docs/pipelines/ner/behaviors/index.md b/docs/pipelines/ner/behaviors/overview.md
similarity index 77%
rename from docs/pipelines/ner/behaviors/index.md
rename to docs/pipelines/ner/behaviors/overview.md
index 1f2b28e2d..4d372d38e 100644
--- a/docs/pipelines/ner/behaviors/index.md
+++ b/docs/pipelines/ner/behaviors/overview.md
@@ -2,16 +2,15 @@
## Presentation
-At the moment, EDS-NLP exposes two pipelines extracting behavioral patterns, namely the tobacco and alcohol consumption status. Each component is based on the ContextualMatcher component.
+EDS-NLP offers two components to extract behavioral patterns, namely the tobacco and alcohol consumption status. Each component is based on the ContextualMatcher component.
Some general considerations about those components:
-- Extracted entities are stored in the `doc.spans` dictionary. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
-- The comorbidity is also available under the `ent.label_` of each match.
-- Matches have an associated `_.status` attribute taking the value `0`, `1`, or `2`. A corresponding `_.detailled_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
+- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
+- The matched comorbidity is also available under the `ent.label_` of each match.
+- Matches have an associated `_.status` attribute taking the value `0`, `1`, or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
- Some components add additional information to matches. For instance, the `eds.tobacco` component adds, if relevant, the extracted *pack-years* (= *paquet-année*). This information is available under the `ent._.assigned` attribute.
- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline with the following parameters:
-
- ```python
+ ```{ .python .no-check }
nlp.add_pipe(
"eds.normalizer",
config=dict(
@@ -32,7 +31,7 @@ Some general considerations about those components:
)
```
-- Those components **should be used with a qualification pipeline** to avoid extracted unwanted matches. At the very least, you can use available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developped and trained specificaly for those components. For privacy reason, the model isn't publicly available yet.
+- Those components **should be used with a qualification pipeline** to avoid extracting unwanted matches. At the very least, you can use the available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reasons, the model isn't publicly available yet.
!!! aphp "Use the ML model"
@@ -40,9 +39,7 @@ Some general considerations about those components:
## Usage
-
-
-```python
+```{ .python .no-check }
import spacy
nlp = spacy.blank("eds")
@@ -87,16 +84,16 @@ doc.spans
# 'diabetes': [rétinopathie diabétique, diabète]
# }
-tobacco = doc.spans["tobacco"]
-tobacco[0]._.detailled_status
+tobacco_matches = doc.spans["tobacco"]
+tobacco_matches[0]._.detailed_status
# Out: "ABSTINENCE" #
-tobacco[0]._.assigned["PA"] # paquet-année
+tobacco_matches[0]._.assigned["PA"] # paquet-année
# Out: 10 # (1)
diabetes = doc.spans["diabetes"]
-(diabetes[0]._.detailled_status, diabetes[1]._.detailled_status)
+(diabetes[0]._.detailed_status, diabetes[1]._.detailed_status)
# Out: ('WITH_COMPLICATION', 'WITHOUT_COMPLICATION') # (2)
```
diff --git a/docs/pipelines/ner/behaviors/tobacco.md b/docs/pipelines/ner/behaviors/tobacco.md
index c52c4a352..ca6383cab 100644
--- a/docs/pipelines/ner/behaviors/tobacco.md
+++ b/docs/pipelines/ner/behaviors/tobacco.md
@@ -1,180 +1,7 @@
-# Tobacco consumption
+# Tobacco consumption {: #edsnlp.pipelines.ner.behaviors.tobacco.factory.create_component }
-The `eds.tobacco` pipeline component extracts mentions of tobacco consumption.
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/behaviors/tobacco/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to either
- - `"PRESENT"`
-    - `"ABSTINENCE"` if the patient stopped their consumption
- - `"ABSENT"` if the patient has no tobacco dependence
-- `span._.assigned`: dictionary with the following keys, if relevant:
-    - `PA`: the mentioned *pack-years* (= *paquet-année*)
-    - `secondhand`: whether the mention refers to secondhand smoking
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe("eds.tobacco")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Tabagisme évalué à 15 PA"
- doc = nlp(text)
- spans = doc.spans["tobacco"]
-
- spans
- # Out: [Tabagisme évalué à 15 PA]
-
- span = spans[0]
-
- span._.assigned
- # Out: {'PA': 15}
- ```
-
-
-
-=== "2"
- ```python
- text = "Patient tabagique"
- doc = nlp(text)
- spans = doc.spans["tobacco"]
-
- spans
- # Out: [tabagique]
- ```
-
-
-
-=== "3"
- ```python
- text = "Tabagisme festif"
- doc = nlp(text)
- spans = doc.spans["tobacco"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "4"
- ```python
- text = "On a un tabagisme ancien"
- doc = nlp(text)
- spans = doc.spans["tobacco"]
-
- spans
- # Out: [tabagisme ancien]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: ABSTINENCE
-
- span._.assigned
- # Out: {'stopped': [ancien]}
- ```
-
-
-
-=== "5"
- ```python
- text = "Tabac: 0"
- doc = nlp(text)
- spans = doc.spans["tobacco"]
-
- spans
- # Out: [Tabac: 0]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: ABSENT
-
- span._.assigned
- # Out: {'zero_after': [0]}
- ```
-
-
-
-=== "6"
- ```python
- text = "Tabagisme passif"
- doc = nlp(text)
- spans = doc.spans["tobacco"]
-
- spans
- # Out: [Tabagisme passif]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: ABSENT
-
- span._.assigned
- # Out: {'secondhand': passif}
- ```
-
-
-
-=== "7"
- ```python
- text = "Tabac: sevré depuis 5 ans"
- doc = nlp(text)
- spans = doc.spans["tobacco"]
-
- spans
- # Out: [Tabac: sevré]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: ABSTINENCE
-
- span._.assigned
- # Out: {'stopped': [sevré]}
- ```
-
-## Authors and citation
-
-The `eds.tobacco` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
+::: edsnlp.pipelines.ner.behaviors.tobacco.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
diff --git a/docs/pipelines/ner/cim10.md b/docs/pipelines/ner/cim10.md
index cb294e692..42312c6e7 100644
--- a/docs/pipelines/ner/cim10.md
+++ b/docs/pipelines/ner/cim10.md
@@ -1,44 +1,8 @@
-# CIM10
-
-The `eds.cim10` pipeline component matches the CIM10 (French-language ICD) terminology.
-
-!!! warning "Very low recall"
-
-    When using the `exact` matching mode, this component has a very poor recall performance.
- We can use the `simstring` mode to retrieve approximate matches, albeit at the cost of a significantly higher computation time.
-
-## Usage
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.cim10", config=dict(term_matcher="simstring"))
-
-text = "Le patient est suivi pour fièvres typhoïde et paratyphoïde."
-
-doc = nlp(text)
-
-doc.ents
-# Out: (fièvres typhoïde et paratyphoïde,)
-
-ent = doc.ents[0]
-
-ent.label_
-# Out: cim10
-
-ent.kb_id_
-# Out: A01
-```
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
+# CIM10 {: #edsnlp.pipelines.ner.cim10.factory.create_component }
::: edsnlp.pipelines.ner.cim10.factory.create_component
options:
- only_parameters: true
-
-## Authors and citation
-
-The `eds.cim10` pipeline was developed by AP-HP's Data Science team.
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/covid.md b/docs/pipelines/ner/covid.md
index 226a5e0ef..28d37c034 100644
--- a/docs/pipelines/ner/covid.md
+++ b/docs/pipelines/ner/covid.md
@@ -1,31 +1,8 @@
-# COVID
-
-The `eds.covid` pipeline component detects mentions of COVID19 and adds them to `doc.ents`.
-
-## Usage
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.covid")
-
-text = "Le patient est admis pour une infection au coronavirus."
-
-doc = nlp(text)
-
-doc.ents
-# Out: (infection au coronavirus,)
-```
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
+# COVID {: #edsnlp.pipelines.ner.covid.factory.create_component }
::: edsnlp.pipelines.ner.covid.factory.create_component
options:
- only_parameters: true
-
-## Authors and citation
-
-The `eds.covid` pipeline was developed by AP-HP's Data Science team.
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/AIDS.md b/docs/pipelines/ner/disorders/AIDS.md
deleted file mode 100644
index 1ce0a82a1..000000000
--- a/docs/pipelines/ner/disorders/AIDS.md
+++ /dev/null
@@ -1,119 +0,0 @@
-# AIDS
-
-The `eds.AIDS` pipeline component extracts mentions of AIDS. It will notably match:
-
-- Mentions of VIH/HIV at the SIDA/AIDS stage
-- Mentions of VIH/HIV with opportunistic infection(s)
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/AIDS/patterns.py"
- # fmt: on
- ```
-
-!!! warning "On HIV infection"
-
-    Pre-AIDS HIV infections are not extracted, only AIDS.
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-- `span._.assigned`: dictionary with the following keys, if relevant:
-    - `opportunist`: list of opportunistic infections extracted around the HIV mention
- - `stage`: stage of the HIV infection
-
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe("eds.AIDS")
-```
-
-Below are a few examples:
-
-=== "SIDA"
- ```python
- text = "Patient atteint du VIH au stade SIDA."
- doc = nlp(text)
- spans = doc.spans["AIDS"]
-
- spans
- # Out: [VIH au stade SIDA]
- ```
-
-
-
-=== "VIH"
- ```python
- text = "Patient atteint du VIH."
- doc = nlp(text)
- spans = doc.spans["AIDS"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "Coinfection"
- ```python
- text = "Il y a un VIH avec coinfection pneumocystose"
- doc = nlp(text)
- spans = doc.spans["AIDS"]
-
- spans
- # Out: [VIH]
-
- span = spans[0]
-
- span._.assigned
- # Out: {'opportunist': [coinfection, pneumocystose]}
- ```
-
-
-
-=== "VIH stade SIDA"
- ```python
- text = "Présence d'un VIH stade C"
- doc = nlp(text)
- spans = doc.spans["AIDS"]
-
- spans
- # Out: [VIH]
-
- span = spans[0]
-
- span._.assigned
- # Out: {'stage': [C]}
- ```
-
-## Authors and citation
-
-The `eds.AIDS` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
diff --git a/docs/pipelines/ner/disorders/CKD.md b/docs/pipelines/ner/disorders/CKD.md
deleted file mode 100644
index 583a2c36a..000000000
--- a/docs/pipelines/ner/disorders/CKD.md
+++ /dev/null
@@ -1,217 +0,0 @@
-# CKD
-
-The `eds.CKD` pipeline component extracts mentions of CKD (Chronic Kidney Disease). It will notably match:
-
-- Mentions of various diseases (see below)
-- Kidney transplantation
-- Chronic dialysis
-- Renal failure **from stage 3 to 5**. The stage is extracted by trying 3 methods:
-    - Extracting the mentioned stage directly ("*IRC stade IV*")
-    - Extracting the severity directly ("*IRC terminale*")
-    - Extracting the mentioned GFR (DFG in French) ("*IRC avec DFG estimé à 30 mL/min/1,73m2*")
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/CKD/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-- `span._.assigned`: dictionary with the following keys, if relevant:
-    - `stage`: the mentioned renal failure stage
-    - `status`: the mentioned renal failure severity (e.g. modérée, sévère, terminale)
-    - `dfg`: the mentioned DFG
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe("eds.CKD")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Patient atteint d'une glomérulopathie."
- doc = nlp(text)
- spans = doc.spans["CKD"]
-
- spans
- # Out: [glomérulopathie]
- ```
-
-
-
-=== "2"
- ```python
- text = "Patient atteint d'une tubulopathie aigüe."
- doc = nlp(text)
- spans = doc.spans["CKD"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "3"
- ```python
- text = "Patient transplanté rénal"
- doc = nlp(text)
- spans = doc.spans["CKD"]
-
- spans
- # Out: [transplanté rénal]
- ```
-
-
-
-=== "4"
- ```python
- text = "Présence d'une insuffisance rénale aigüe sur chronique"
- doc = nlp(text)
- spans = doc.spans["CKD"]
-
- spans
- # Out: [insuffisance rénale aigüe sur chronique]
- ```
-
-
-
-=== "5"
- ```python
- text = "Le patient a été dialysé"
- doc = nlp(text)
- spans = doc.spans["CKD"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "6"
- ```python
- text = "Le patient est dialysé chaque lundi"
- doc = nlp(text)
- spans = doc.spans["CKD"]
-
- spans
- # Out: [dialysé chaque lundi]
-
- span = spans[0]
-
- span._.assigned
- # Out: {'chronic': [lundi]}
- ```
-
-
-
-=== "7"
- ```python
- text = "Présence d'une IRC"
- doc = nlp(text)
- spans = doc.spans["CKD"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "8"
- ```python
- text = "Présence d'une IRC sévère"
- doc = nlp(text)
- spans = doc.spans["CKD"]
-
- spans
- # Out: [IRC sévère]
-
- span = spans[0]
-
- span._.assigned
- # Out: {'status': sévère}
- ```
-
-
-
-=== "9"
- ```python
- text = "Présence d'une IRC au stade IV"
- doc = nlp(text)
- spans = doc.spans["CKD"]
-
- spans
- # Out: [IRC au stade IV]
-
- span = spans[0]
-
- span._.assigned
- # Out: {'stage': IV}
- ```
-
-
-
-=== "10"
- ```python
- text = "Présence d'une IRC avec DFG à 30"
- doc = nlp(text)
- spans = doc.spans["CKD"]
-
- spans
- # Out: [IRC avec DFG à 30]
-
- span = spans[0]
-
- span._.assigned
- # Out: {'dfg': 30}
- ```
-
-
-
-=== "11"
- ```python
- text = "Présence d'une maladie rénale avec DFG à 110"
- doc = nlp(text)
- spans = doc.spans["CKD"]
-
- spans
- # Out: []
- ```
-
-## Authors and citation
-
-The `eds.CKD` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
diff --git a/docs/pipelines/ner/disorders/COPD.md b/docs/pipelines/ner/disorders/COPD.md
deleted file mode 100644
index 46c259623..000000000
--- a/docs/pipelines/ner/disorders/COPD.md
+++ /dev/null
@@ -1,132 +0,0 @@
-# COPD
-
-The `eds.COPD` pipeline component extracts mentions of COPD (*Chronic obstructive pulmonary disease*). It will notably match:
-
-- Mentions of various diseases (see below)
-- Pulmonary hypertension
-- Long-term oxygen therapy
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/COPD/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-
-## Usage
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.COPD")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Une fibrose interstitielle diffuse idiopathique"
- doc = nlp(text)
- spans = doc.spans["COPD"]
-
- spans
- # Out: [fibrose interstitielle diffuse idiopathique]
- ```
-
-
-
-=== "2"
- ```python
- text = "Patient atteint de pneumoconiose"
- doc = nlp(text)
- spans = doc.spans["COPD"]
-
- spans
- # Out: [pneumoconiose]
- ```
-
-
-
-=== "3"
- ```python
- text = "Présence d'une HTAP."
- doc = nlp(text)
- spans = doc.spans["COPD"]
-
- spans
- # Out: [HTAP]
- ```
-
-
-
-=== "4"
- ```python
- text = "On voit une hypertension pulmonaire minime"
- doc = nlp(text)
- spans = doc.spans["COPD"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "5"
- ```python
- text = "La patiente a été mis sous oxygénorequérance"
- doc = nlp(text)
- spans = doc.spans["COPD"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "6"
- ```python
- text = "La patiente est sous oxygénorequérance au long cours"
- doc = nlp(text)
- spans = doc.spans["COPD"]
-
- spans
- # Out: [oxygénorequérance au long cours]
-
- span = spans[0]
-
- span._.assigned
- # Out: {'long': [long cours]}
- ```
-
-## Authors and citation
-
-The `eds.COPD` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
diff --git a/docs/pipelines/ner/disorders/aids.md b/docs/pipelines/ner/disorders/aids.md
new file mode 100644
index 000000000..1061cc83b
--- /dev/null
+++ b/docs/pipelines/ner/disorders/aids.md
@@ -0,0 +1,8 @@
+# AIDS {: #edsnlp.pipelines.ner.disorders.aids.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.aids.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/cerebrovascular-accident.md b/docs/pipelines/ner/disorders/cerebrovascular-accident.md
new file mode 100644
index 000000000..d48bc9d8c
--- /dev/null
+++ b/docs/pipelines/ner/disorders/cerebrovascular-accident.md
@@ -0,0 +1,8 @@
+# Cerebrovascular accident {: #edsnlp.pipelines.ner.disorders.cerebrovascular_accident.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.cerebrovascular_accident.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/ckd.md b/docs/pipelines/ner/disorders/ckd.md
new file mode 100644
index 000000000..e17f92c3e
--- /dev/null
+++ b/docs/pipelines/ner/disorders/ckd.md
@@ -0,0 +1,8 @@
+# CKD {: #edsnlp.pipelines.ner.disorders.ckd.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.ckd.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/congestive-heart-failure.md b/docs/pipelines/ner/disorders/congestive-heart-failure.md
new file mode 100644
index 000000000..8651dbeef
--- /dev/null
+++ b/docs/pipelines/ner/disorders/congestive-heart-failure.md
@@ -0,0 +1,8 @@
+# Congestive heart failure {: #edsnlp.pipelines.ner.disorders.congestive_heart_failure.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.congestive_heart_failure.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/congestive_heart_failure.md b/docs/pipelines/ner/disorders/congestive_heart_failure.md
deleted file mode 100644
index 462a4e7c0..000000000
--- a/docs/pipelines/ner/disorders/congestive_heart_failure.md
+++ /dev/null
@@ -1,118 +0,0 @@
-# Congestive heart failure
-
-The `eds.congestive_heart_failure` pipeline component extracts mentions of congestive heart failure. It will notably match:
-
-- Mentions of various diseases (see below)
-- Heart transplantation
-- AF (Atrial Fibrillation)
-- Pacemaker
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/congestive_heart_failure/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.congestive_heart_failure")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Présence d'un oedème pulmonaire"
- doc = nlp(text)
- spans = doc.spans["congestive_heart_failure"]
-
- spans
- # Out: [oedème pulmonaire]
- ```
-
-
-
-=== "2"
- ```python
- text = "Le patient est équipé d'un pace-maker"
- doc = nlp(text)
- spans = doc.spans["congestive_heart_failure"]
-
- spans
- # Out: [pace-maker]
- ```
-
-
-
-=== "3"
- ```python
- text = "Un cardiopathie non décompensée"
- doc = nlp(text)
- spans = doc.spans["congestive_heart_failure"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "4"
- ```python
- text = "Insuffisance cardiaque"
- doc = nlp(text)
- spans = doc.spans["congestive_heart_failure"]
-
- spans
- # Out: [Insuffisance cardiaque]
- ```
-
-
-
-=== "5"
- ```python
- text = "Insuffisance cardiaque minime"
- doc = nlp(text)
- spans = doc.spans["congestive_heart_failure"]
-
- spans
- # Out: []
- ```
-
-## Authors and citation
-
-The `eds.congestive_heart_failure` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
diff --git a/docs/pipelines/ner/disorders/connective-tissue-disease.md b/docs/pipelines/ner/disorders/connective-tissue-disease.md
new file mode 100644
index 000000000..e9364ee26
--- /dev/null
+++ b/docs/pipelines/ner/disorders/connective-tissue-disease.md
@@ -0,0 +1,8 @@
+# Connective tissue disease {: #edsnlp.pipelines.ner.disorders.connective_tissue_disease.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.connective_tissue_disease.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/connective_tissue_disease.md b/docs/pipelines/ner/disorders/connective_tissue_disease.md
deleted file mode 100644
index 97e72180d..000000000
--- a/docs/pipelines/ner/disorders/connective_tissue_disease.md
+++ /dev/null
@@ -1,113 +0,0 @@
-# Connective tissue disease
-
-The `eds.connective_tissue_disease` pipeline component extracts mentions of connective tissue diseases.
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/connective_tissue_disease/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.connective_tissue_disease")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Présence d'une sclérodermie."
- doc = nlp(text)
- spans = doc.spans["connective_tissue_disease"]
-
- spans
- # Out: [sclérodermie]
- ```
-
-
-
-=== "2"
- ```python
- text = "Patient atteint d'un lupus."
- doc = nlp(text)
- spans = doc.spans["connective_tissue_disease"]
-
- spans
- # Out: [lupus]
- ```
-
-
-
-=== "3"
- ```python
- text = "Présence d'anticoagulants lupiques,"
- doc = nlp(text)
- spans = doc.spans["connective_tissue_disease"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "4"
- ```python
- text = "Il y a une MICI."
- doc = nlp(text)
- spans = doc.spans["connective_tissue_disease"]
-
- spans
- # Out: [MICI]
- ```
-
-
-
-=== "5"
- ```python
- text = "Syndrome de Raynaud"
- doc = nlp(text)
- spans = doc.spans["connective_tissue_disease"]
-
- spans
- # Out: [Raynaud]
- ```
-
-## Authors and citation
-
-The `eds.connective_tissue_disease` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
diff --git a/docs/pipelines/ner/disorders/copd.md b/docs/pipelines/ner/disorders/copd.md
new file mode 100644
index 000000000..f9154630d
--- /dev/null
+++ b/docs/pipelines/ner/disorders/copd.md
@@ -0,0 +1,8 @@
+# COPD {: #edsnlp.pipelines.ner.disorders.copd.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.copd.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/dementia.md b/docs/pipelines/ner/disorders/dementia.md
index 3fb2eae41..320040a4d 100644
--- a/docs/pipelines/ner/disorders/dementia.md
+++ b/docs/pipelines/ner/disorders/dementia.md
@@ -1,101 +1,8 @@
-# Dementia
-
-The `eds.dementia` pipeline component extracts mentions of dementia.
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/dementia/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.dementia")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "D'importants déficits cognitifs"
- doc = nlp(text)
- spans = doc.spans["dementia"]
-
- spans
- # Out: [déficits cognitifs]
- ```
-
-
-
-=== "2"
- ```python
- text = "Patient atteint de démence"
- doc = nlp(text)
- spans = doc.spans["dementia"]
-
- spans
- # Out: [démence]
- ```
-
-
-
-=== "3"
- ```python
- text = "On retrouve des anti-SLA"
- doc = nlp(text)
- spans = doc.spans["dementia"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "4"
- ```python
- text = "Une maladie de Charcot"
- doc = nlp(text)
- spans = doc.spans["dementia"]
-
- spans
- # Out: [maladie de Charcot]
- ```
-
-## Authors and citation
-
-The `eds.dementia` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
+# Dementia {: #edsnlp.pipelines.ner.disorders.dementia.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.dementia.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/diabetes.md b/docs/pipelines/ner/disorders/diabetes.md
index 3f3f09895..4013e3235 100644
--- a/docs/pipelines/ner/disorders/diabetes.md
+++ b/docs/pipelines/ner/disorders/diabetes.md
@@ -1,164 +1,8 @@
-# Diabetes
-
-The `eds.diabetes` pipeline component extracts mentions of diabetes.
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/diabetes/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to either
- - `"WITH_COMPLICATION"` if the diabetes is complicated (e.g., via organ damages)
- - `"WITHOUT_COMPLICATION"` else
-- `span._.assigned`: dictionary with the following keys, if relevant:
- - `type`: type of diabetes (I or II)
- - `insulin`: if the diabetes is insulin-dependent
-  - `cortico`: if the diabetes is corticoid-induced
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.diabetes")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Présence d'un DT2"
- doc = nlp(text)
- spans = doc.spans["diabetes"]
-
- spans
- # Out: [DT2]
- ```
-
-
-
-=== "2"
- ```python
- text = "Présence d'un DNID"
- doc = nlp(text)
- spans = doc.spans["diabetes"]
-
- spans
- # Out: [DNID]
- ```
-
-
-
-=== "3"
- ```python
- text = "Patient diabétique"
- doc = nlp(text)
- spans = doc.spans["diabetes"]
-
- spans
- # Out: [diabétique]
- ```
-
-
-
-=== "4"
- ```python
- text = "Un diabète insipide"
- doc = nlp(text)
- spans = doc.spans["diabetes"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "5"
- ```python
- text = "Atteinte neurologique d'origine diabétique"
- doc = nlp(text)
- spans = doc.spans["diabetes"]
-
- spans
- # Out: [origine diabétique]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: WITH_COMPLICATION
-
- span._.assigned
- # Out: {'complicated_before': [origine]}
- ```
-
-
-
-=== "6"
- ```python
- text = "Une rétinopathie diabétique"
- doc = nlp(text)
- spans = doc.spans["diabetes"]
-
- spans
- # Out: [rétinopathie diabétique]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: WITH_COMPLICATION
-
- span._.assigned
- # Out: {'complicated_before': [rétinopathie]}
- ```
-
-
-
-=== "7"
- ```python
- text = "Il y a un mal perforant plantaire"
- doc = nlp(text)
- spans = doc.spans["diabetes"]
-
- spans
- # Out: [mal perforant plantaire]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: WITH_COMPLICATION
- ```
-
-## Authors and citation
-
-The `eds.diabetes` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
+# Diabetes {: #edsnlp.pipelines.ner.disorders.diabetes.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.diabetes.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/hemiplegia.md b/docs/pipelines/ner/disorders/hemiplegia.md
index b76cbb0e4..39a3b453b 100644
--- a/docs/pipelines/ner/disorders/hemiplegia.md
+++ b/docs/pipelines/ner/disorders/hemiplegia.md
@@ -1,89 +1,8 @@
-# Hemiplegia
-
-The `eds.hemiplegia` pipeline component extracts mentions of hemiplegia.
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/hemiplegia/patterns.py"
- # fmt: on
- ```
-
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-
-## Usage
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.hemiplegia")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Patient hémiplégique"
- doc = nlp(text)
- spans = doc.spans["hemiplegia"]
-
- spans
- # Out: [hémiplégique]
- ```
-
-
-
-=== "2"
- ```python
- text = "Paralysie des membres inférieurs"
- doc = nlp(text)
- spans = doc.spans["hemiplegia"]
-
- spans
- # Out: [Paralysie des membres]
- ```
-
-
-
-=== "3"
- ```python
- text = "Patient en LIS"
- doc = nlp(text)
- spans = doc.spans["hemiplegia"]
-
- spans
- # Out: [LIS]
- ```
-
-## Authors and citation
-
-The `eds.hemiplegia` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
+# Hemiplegia {: #edsnlp.pipelines.ner.disorders.hemiplegia.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.hemiplegia.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/index.md b/docs/pipelines/ner/disorders/index.md
deleted file mode 100644
index 5cad8ab0d..000000000
--- a/docs/pipelines/ner/disorders/index.md
+++ /dev/null
@@ -1,118 +0,0 @@
-# Disorders
-
-## Presentation
-
-The following components extract various mentions of disorders. At the moment, the available components match the 16 different conditions from the [Charlson Comorbidity Index](https://www.rdplf.org/calculateurs/pages/charlson/charlson.html). Each component is based on the ContextualMatcher component.
-Some general considerations about those components:
-
-- Extracted entities are stored in the `doc.spans` dictionary. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
-- The comorbidity is also available under the `ent.label_` of each match.
-- Matches have an associated `_.status` attribute taking the value `0`, `1`, or `2`. A corresponding `_.detailled_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
-- Some components add additional information to matches. For instance, the `tobacco` component adds, if relevant, the extracted *pack-year* (= *paquet-année*). This information is available under the `ent._.assigned` attribute.
-- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline with the following parameters:
-
- ```python
- nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
- )
- ```
-
-- Those components **should be used with a qualification pipeline** to avoid extracting unwanted matches. At the very least, you can use available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reasons, the model isn't publicly available yet.
-
- !!! aphp "Use the ML model"
-
- The model will soon be available in the models catalogue of AP-HP's CDW.
-
-!!! tip "On the medical definition of the comorbidities"
-
-    Those components were developed to extract **chronic** and **symptomatic** conditions only.
-
-## Aggregation
-
-For relevant phenotyping, matches should be aggregated at the document level. For instance, a document might mention a complicated diabetes at the beginning ("*Le patient a une rétinopathie diabétique*"), and then refer to this diabetes without mentioning that it is complicated anymore ("*Concernant son diabète, le patient ...*").
-Thus, a good and simple aggregation rule is, for each comorbidity, to
-
-- disregard all entities tagged as irrelevant by the qualification component(s)
-- take the maximum (i.e., the most severe) status of the leftover entities
-
-An implementation of this rule is presented [here][aggregating-results].
-
-## Usage
-
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe("eds.tobacco")
-nlp.add_pipe("eds.diabetes")
-
-text = """
-Compte-rendu de consultation.
-
-Je vois ce jour M. SCOTT pour le suivi de sa rétinopathie diabétique.
-Le patient va bien depuis la dernière fois.
-Je le félicite pour la poursuite de son sevrage tabagique (toujours à 10 paquet-année).
-
-Sur le plan de son diabète, la glycémie est stable.
-"""
-
-doc = nlp(text)
-
-doc.spans
-# Out: {
-# 'pollutions': [],
-# 'tobacco': [sevrage tabagique (toujours à 10 paquet-année],
-# 'diabetes': [rétinopathie diabétique, diabète]
-# }
-
-tobacco = doc.spans["tobacco"]
-tobacco[0]._.detailled_status
-# Out: "ABSTINENCE"
-
-tobacco[0]._.assigned["PA"] # paquet-année
-# Out: 10 # (1)
-
-
-diabetes = doc.spans["diabetes"]
-(diabetes[0]._.detailled_status, diabetes[1]._.detailled_status)
-# Out: ('WITH_COMPLICATION', 'WITHOUT_COMPLICATION') # (2)
-```
-
-1. Here we see an example of additional information that can be extracted
-2. Here we see the importance of document-level aggregation to extract the correct severity of each condition.
diff --git a/docs/pipelines/ner/disorders/leukemia.md b/docs/pipelines/ner/disorders/leukemia.md
index 7335178d1..07487e6a7 100644
--- a/docs/pipelines/ner/disorders/leukemia.md
+++ b/docs/pipelines/ner/disorders/leukemia.md
@@ -1,101 +1,8 @@
-# Leukemia
-
-The `eds.leukemia` pipeline component extracts mentions of leukemia.
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/leukemia/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.leukemia")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Sydrome myéloprolifératif"
- doc = nlp(text)
- spans = doc.spans["leukemia"]
-
- spans
- # Out: [myéloprolifératif]
- ```
-
-
-
-=== "2"
- ```python
- text = "Sydrome myéloprolifératif bénin"
- doc = nlp(text)
- spans = doc.spans["leukemia"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "3"
- ```python
- text = "Patient atteint d'une LAM"
- doc = nlp(text)
- spans = doc.spans["leukemia"]
-
- spans
- # Out: [LAM]
- ```
-
-
-
-=== "4"
- ```python
- text = "Une maladie de Vaquez"
- doc = nlp(text)
- spans = doc.spans["leukemia"]
-
- spans
- # Out: [Vaquez]
- ```
-
-## Authors and citation
-
-The `eds.leukemia` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
+# Leukemia {: #edsnlp.pipelines.ner.disorders.leukemia.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.leukemia.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/liver-disease.md b/docs/pipelines/ner/disorders/liver-disease.md
new file mode 100644
index 000000000..262f8ed5a
--- /dev/null
+++ b/docs/pipelines/ner/disorders/liver-disease.md
@@ -0,0 +1,8 @@
+# Liver disease {: #edsnlp.pipelines.ner.disorders.liver_disease.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.liver_disease.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/liver_disease.md b/docs/pipelines/ner/disorders/liver_disease.md
deleted file mode 100644
index a687952bd..000000000
--- a/docs/pipelines/ner/disorders/liver_disease.md
+++ /dev/null
@@ -1,113 +0,0 @@
-# Liver disease
-
-The `eds.liver_disease` pipeline component extracts mentions of liver disease.
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/liver_disease/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to either
- - `"MILD"` for mild liver diseases
- - `"MODERATE_TO_SEVERE"` else
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.liver_disease")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Il y a une fibrose hépatique"
- doc = nlp(text)
- spans = doc.spans["liver_disease"]
-
- spans
- # Out: [fibrose hépatique]
- ```
-
-
-
-=== "2"
- ```python
- text = "Une hépatite B chronique"
- doc = nlp(text)
- spans = doc.spans["liver_disease"]
-
- spans
- # Out: [hépatite B chronique]
- ```
-
-
-
-=== "3"
- ```python
- text = "Le patient consulte pour une cirrhose"
- doc = nlp(text)
- spans = doc.spans["liver_disease"]
-
- spans
- # Out: [cirrhose]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: MODERATE_TO_SEVERE
- ```
-
-
-
-=== "4"
- ```python
- text = "Greffe hépatique."
- doc = nlp(text)
- spans = doc.spans["liver_disease"]
-
- spans
- # Out: [Greffe hépatique]
-
- span = spans[0]
-
- span._.detailled_status
- # Out: MODERATE_TO_SEVERE
- ```
-
-## Authors and citation
-
-The `eds.liver_disease` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
diff --git a/docs/pipelines/ner/disorders/lymphoma.md b/docs/pipelines/ner/disorders/lymphoma.md
index b1a659726..20dea7ba2 100644
--- a/docs/pipelines/ner/disorders/lymphoma.md
+++ b/docs/pipelines/ner/disorders/lymphoma.md
@@ -1,106 +1,8 @@
-# Lymphoma
-
-The `eds.lymphoma` pipeline component extracts mentions of lymphoma.
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/lymphoma/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-
-!!! warning "Monoclonal gammapathy"
-
- Monoclonal gammapathies are not extracted by this pipeline
-
-## Usage
-
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.lymphoma")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Un lymphome de Hodgkin."
- doc = nlp(text)
- spans = doc.spans["lymphoma"]
-
- spans
- # Out: [lymphome de Hodgkin]
- ```
-
-
-
-=== "2"
- ```python
- text = "Atteint d'un Waldenstörm"
- doc = nlp(text)
- spans = doc.spans["lymphoma"]
-
- spans
- # Out: [Waldenstörm]
- ```
-
-
-
-=== "3"
- ```python
- text = "Un LAGC"
- doc = nlp(text)
- spans = doc.spans["lymphoma"]
-
- spans
- # Out: [LAGC]
- ```
-
-
-
-=== "4"
- ```python
- text = "anti LAGC: 10^4/mL"
- doc = nlp(text)
- spans = doc.spans["lymphoma"]
-
- spans
- # Out: []
- ```
-
-## Authors and citation
-
-The `eds.lymphoma` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
+# Lymphoma {: #edsnlp.pipelines.ner.disorders.lymphoma.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.lymphoma.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/myocardial-infarction.md b/docs/pipelines/ner/disorders/myocardial-infarction.md
new file mode 100644
index 000000000..8b01977ff
--- /dev/null
+++ b/docs/pipelines/ner/disorders/myocardial-infarction.md
@@ -0,0 +1,8 @@
+# Myocardial infarction {: #edsnlp.pipelines.ner.disorders.myocardial_infarction.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.myocardial_infarction.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/myocardial_infarction.md b/docs/pipelines/ner/disorders/myocardial_infarction.md
deleted file mode 100644
index 85b30399a..000000000
--- a/docs/pipelines/ner/disorders/myocardial_infarction.md
+++ /dev/null
@@ -1,127 +0,0 @@
-# Myocardial infarction
-
-The `eds.myocardial_infarction` pipeline component extracts mentions of myocardial infarction. It will notably match:
-
-- Mentions of various diseases (see below)
-- Mentions of stents with a heart localization
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/myocardial_infarction/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-- `span._.assigned`: dictionary with the following keys, if relevant:
- - `heart_localized`: localization of the stent or bypass
-
-## Usage
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.myocardial_infarction")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Une cardiopathie ischémique"
- doc = nlp(text)
- spans = doc.spans["myocardial_infarction"]
-
- spans
- # Out: [cardiopathie ischémique]
- ```
-
-
-
-=== "2"
- ```python
- text = "Une cardiopathie non-ischémique"
- doc = nlp(text)
- spans = doc.spans["myocardial_infarction"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "3"
- ```python
- text = "Présence d'un stent sur la marginale"
- doc = nlp(text)
- spans = doc.spans["myocardial_infarction"]
-
- spans
- # Out: [stent sur la marginale]
-
- span = spans[0]
-
- span._.assigned
- # Out: {'heart_localized': [marginale]}
- ```
-
-
-
-=== "4"
- ```python
- text = "Présence d'un stent périphérique"
- doc = nlp(text)
- spans = doc.spans["myocardial_infarction"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "5"
- ```python
- text = "infarctus du myocarde"
- doc = nlp(text)
- spans = doc.spans["myocardial_infarction"]
-
- spans
- # Out: [infarctus du myocarde]
-
- span = spans[0]
-
- span._.assigned
- # Out: {'heart_localized': [myocarde]}
- ```
-
-## Authors and citation
-
-The `eds.myocardial_infarction` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
diff --git a/docs/pipelines/ner/disorders/overview.md b/docs/pipelines/ner/disorders/overview.md
new file mode 100644
index 000000000..971db10ad
--- /dev/null
+++ b/docs/pipelines/ner/disorders/overview.md
@@ -0,0 +1,53 @@
+# Disorders
+
+## Presentation
+
+The following components extract 16 different conditions from the [Charlson Comorbidity Index](https://www.rdplf.org/calculateurs/pages/charlson/charlson.html). Each component is based on the ContextualMatcher component.
+Some general considerations about those components:
+
+- Extracted entities are stored in `doc.ents` and `doc.spans`. For instance, the `eds.tobacco` component stores matches in `doc.spans["tobacco"]`.
+- The matched comorbidity is also available under the `ent.label_` of each match.
+- Matches have an associated `_.status` attribute taking the value `0`, `1`, or `2`. A corresponding `_.detailed_status` attribute stores the human-readable status, which can be component-dependent. See each component documentation for more details.
+- Some components add additional information to matches. For instance, the `tobacco` component adds, if relevant, the extracted *pack-year* (= *paquet-année*). This information is available under the `ent._.assigned` attribute.
+- Those components work on **normalized** documents. Please use the `eds.normalizer` pipeline with the following parameters:
+
+ ```{ .python .no-check }
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ ```
+
+- Those components **should be used with a qualification pipeline** to avoid extracting unwanted matches. At the very least, you can use available rule-based qualifiers (`eds.negation`, `eds.hypothesis` and `eds.family`). Better, a machine learning qualification component was developed and trained specifically for those components. For privacy reasons, the model isn't publicly available yet.
+
+ !!! aphp "Use the ML model"
+
+ The model will soon be available in the models catalogue of AP-HP's CDW.
+
+!!! tip "On the medical definition of the comorbidities"
+
+    Those components were developed to extract **chronic** and **symptomatic** conditions only.
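+
+A minimal end-to-end sketch, assuming the pipe names and span extensions documented above:
+
+```{ .python .no-check }
+import spacy
+
+nlp = spacy.blank("eds")
+nlp.add_pipe("eds.sentences")
+nlp.add_pipe("eds.normalizer")  # ideally with the configuration shown above
+nlp.add_pipe("eds.tobacco")
+nlp.add_pipe("eds.diabetes")
+
+doc = nlp("Suivi de sa rétinopathie diabétique. Poursuite du sevrage tabagique.")
+
+for ent in doc.spans["diabetes"]:
+    # each match exposes a numeric status and a human-readable counterpart
+    print(ent, ent._.status, ent._.detailed_status)
+```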
+
+## Aggregation
+
+For relevant phenotyping, matches should be aggregated at the document level. For instance, a document might mention a complicated diabetes at the beginning ("*Le patient a une rétinopathie diabétique*"), and then refer to this diabetes without mentioning that it is complicated anymore ("*Concernant son diabète, le patient ...*").
+Thus, a good and simple aggregation rule is, for each comorbidity, to
+
+- disregard all entities tagged as irrelevant by the qualification component(s)
+- take the maximum (i.e., the most severe) status of the leftover entities
+
+An implementation of this rule is presented [here][aggregating-results], and a minimal sketch follows below.
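+
+This sketch assumes the rule-based qualifiers mentioned above have set boolean flags (`ent._.negation`, `ent._.hypothesis`, `ent._.family`) and uses the numeric `_.status` attribute described in the presentation:
+
+```{ .python .no-check }
+def aggregate(doc, comorbidity: str) -> int:
+    """Document-level status for one comorbidity (0 if no relevant mention)."""
+    relevant = [
+        ent
+        for ent in doc.spans[comorbidity]
+        # disregard entities tagged as irrelevant by the qualification components
+        if not (ent._.negation or ent._.hypothesis or ent._.family)
+    ]
+    # keep the maximum (i.e., the most severe) status of the leftover entities
+    return max((ent._.status for ent in relevant), default=0)
+```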
diff --git a/docs/pipelines/ner/disorders/peptic-ulcer-disease.md b/docs/pipelines/ner/disorders/peptic-ulcer-disease.md
new file mode 100644
index 000000000..bb6089ead
--- /dev/null
+++ b/docs/pipelines/ner/disorders/peptic-ulcer-disease.md
@@ -0,0 +1,8 @@
+# Peptic ulcer disease {: #edsnlp.pipelines.ner.disorders.peptic_ulcer_disease.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.peptic_ulcer_disease.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/peptic_ulcer_disease.md b/docs/pipelines/ner/disorders/peptic_ulcer_disease.md
deleted file mode 100644
index 9bb601d1b..000000000
--- a/docs/pipelines/ner/disorders/peptic_ulcer_disease.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# Peptic ulcer disease
-
-The `eds.peptic_ulcer_disease` pipeline component extracts mentions of peptic ulcer disease.
-
-??? info "Details of the used patterns"
-
- ```python
- # fmt: off
- --8<-- "edsnlp/pipelines/ner/disorders/peptic_ulcer_disease/patterns.py"
- # fmt: on
- ```
-
-## Extensions
-
-On each span `span` that matches, the following attributes are available:
-
-- `span._.detailled_status`: set to `"PRESENT"`
-
-## Usage
-
-
-```python
-import spacy
-
-nlp = spacy.blank("eds")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe(
- "eds.normalizer",
- config=dict(
- accents=True,
- lowercase=True,
- quotes=True,
- spaces=True,
- pollution=dict(
- information=True,
- bars=True,
- biology=True,
- doctors=True,
- web=True,
- coding=True,
- footer=True,
- ),
- ),
-)
-nlp.add_pipe(f"eds.peptic_ulcer_disease")
-```
-
-Below are a few examples:
-
-
-
-
-=== "1"
- ```python
- text = "Beaucoup d'ulcères gastriques"
- doc = nlp(text)
- spans = doc.spans["peptic_ulcer_disease"]
-
- spans
- # Out: [ulcères gastriques]
- ```
-
-
-
-=== "2"
- ```python
- text = "Présence d'UGD"
- doc = nlp(text)
- spans = doc.spans["peptic_ulcer_disease"]
-
- spans
- # Out: [UGD]
- ```
-
-
-
-=== "3"
- ```python
- text = "La patient à des ulcères"
- doc = nlp(text)
- spans = doc.spans["peptic_ulcer_disease"]
-
- spans
- # Out: []
- ```
-
-
-
-=== "4"
- ```python
- text = "Au niveau gastrique: blabla blabla blabla blabla blabla quelques ulcères"
- doc = nlp(text)
- spans = doc.spans["peptic_ulcer_disease"]
-
- spans
- # Out: [ulcères]
-
- span = spans[0]
-
- span._.assigned
- # Out: {'is_peptic': [gastrique]}
- ```
-
-## Authors and citation
-
-The `eds.peptic_ulcer_disease` component was developed by AP-HP's Data Science team with a team of medical experts. A paper describing in detail the development of those components is being drafted and will soon be available.
diff --git a/docs/pipelines/ner/disorders/peripheral-vascular-disease.md b/docs/pipelines/ner/disorders/peripheral-vascular-disease.md
new file mode 100644
index 000000000..df5000115
--- /dev/null
+++ b/docs/pipelines/ner/disorders/peripheral-vascular-disease.md
@@ -0,0 +1,8 @@
+# Peripheral vascular disease {: #edsnlp.pipelines.ner.disorders.peripheral_vascular_disease.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.peripheral_vascular_disease.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/disorders/solid-tumor.md b/docs/pipelines/ner/disorders/solid-tumor.md
new file mode 100644
index 000000000..e8e036ffb
--- /dev/null
+++ b/docs/pipelines/ner/disorders/solid-tumor.md
@@ -0,0 +1,8 @@
+# Solid tumor {: #edsnlp.pipelines.ner.disorders.solid_tumor.factory.create_component }
+
+::: edsnlp.pipelines.ner.disorders.solid_tumor.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/drugs.md b/docs/pipelines/ner/drugs.md
index abc9aa9f4..7d1d448c4 100644
--- a/docs/pipelines/ner/drugs.md
+++ b/docs/pipelines/ner/drugs.md
@@ -1,49 +1,8 @@
-# Drugs
-
-The `eds.drugs` pipeline component detects mentions of French drugs (brand names and active ingredients) and adds them to `doc.ents`.
-Each drug is mapped to an [ATC](https://en.wikipedia.org/wiki/Anatomical_Therapeutic_Chemical_Classification_System) code through
-the Romedi terminology [@cossin:hal-02987843]. The ATC classifies drugs into groups.
-
-## Usage
-
-In this example, we are looking for an oral antidiabetic medication (ATC code: A10B).
-
-```python
-from edsnlp.pipelines.core.terminology import TerminologyTermMatcher
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.normalizer")
-nlp.add_pipe("eds.drugs", config=dict(term_matcher=TerminologyTermMatcher.exact))
-
-text = "Traitement habituel: Kardégic, cardensiel (bisoprolol), glucophage, lasilix"
-
-doc = nlp(text)
-
-drugs_detected = [(x.text, x.kb_id_) for x in doc.ents]
-
-drugs_detected
-# Out: [('Kardégic', 'B01AC06'), ('cardensiel', 'C07AB07'), ('bisoprolol', 'C07AB07'), ('glucophage', 'A10BA02'), ('lasilix', 'C03CA01')]
-
-oral_antidiabetics_detected = list(
- filter(lambda x: (x[1].startswith("A10B")), drugs_detected)
-)
-oral_antidiabetics_detected
-# Out: [('glucophage', 'A10BA02')]
-```
-
-Glucophage is the brand name of a medication that contains metformin, the first-line medication for the treatment of type 2 diabetes.
-
-## Configuration
-
-The pipeline can be configured using the following parameters:
+# Drugs {: #edsnlp.pipelines.ner.drugs.factory.create_component }
::: edsnlp.pipelines.ner.drugs.factory.create_component
options:
- only_parameters: true
-
-## Authors and citation
-
-The `eds.drugs` pipeline was developed by the IAM team and CHU de Bordeaux's Data Science team.
-
-\bibliography
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/index.md b/docs/pipelines/ner/index.md
deleted file mode 100644
index aca254bb6..000000000
--- a/docs/pipelines/ner/index.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Named entity recognition
-
-We provide a few Named Entity Recognition (NER) pipelines.
diff --git a/docs/pipelines/ner/overview.md b/docs/pipelines/ner/overview.md
new file mode 100644
index 000000000..e7271cd6e
--- /dev/null
+++ b/docs/pipelines/ner/overview.md
@@ -0,0 +1,56 @@
+# Named Entity Recognition Components
+
+We provide several Named Entity Recognition (NER) components.
+Named Entity Recognition is the task of identifying short relevant spans of text, called named entities, and classifying them into pre-defined categories.
+In the case of clinical documents, these entities can be scores, disorders, behaviors, codes, dates, measurements, etc.
+
+## Span setters: where are extracted entities stored? {: #edsnlp.pipelines.base.SpanSetterArg }
+
+A component assigns entities to a document by adding them to the `doc.ents` or `doc.spans[group]` attributes. `doc.ents` only supports non-overlapping
+entities; therefore, if two entities overlap, only the longest one is kept. `doc.spans[group]`, on the other hand, can contain overlapping entities.
+To control where entities are added, you can use the `span_setter` argument in any of these components.
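+
+For instance, a minimal sketch (the component and `span_setter` value below are illustrative):
+
+```{ .python .no-check }
+import spacy
+
+nlp = spacy.blank("eds")
+# store COVID matches both in doc.ents and in the "covid" span group
+nlp.add_pipe("eds.covid", config=dict(span_setter=["ents", "covid"]))
+```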
+
+::: edsnlp.pipelines.base.SpanSetterArg
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
+
+## Available components
+
+
+
+| Component | Description |
+|-------------------------------------------------------------------------------------------|---------------------------------------|
+| [`eds.covid`](/pipelines/ner/covid) | A COVID mentions detector |
+| [`eds.charlson`](/pipelines/ner/scores/charlson) | A Charlson score extractor |
+| [`eds.sofa`](/pipelines/ner/scores/sofa) | A SOFA score extractor |
+| [`eds.elston_ellis`](/pipelines/ner/scores/elston-ellis) | An Elston & Ellis code extractor |
+| [`eds.emergency_priority`](/pipelines/ner/scores/emergency-priority) | A priority score extractor |
+| [`eds.emergency_ccmu`](/pipelines/ner/scores/emergency-ccmu) | A CCMU score extractor |
+| [`eds.emergency_gemsa`](/pipelines/ner/scores/emergency-gemsa) | A GEMSA score extractor |
+| [`eds.tnm`](/pipelines/ner/tnm) | A TNM score extractor |
+| [`eds.adicap`](/pipelines/ner/adicap)                                                      | An ADICAP code extractor               |
+| [`eds.drugs`](/pipelines/ner/drugs) | A drug mentions extractor |
+| [`eds.cim10`](/pipelines/ner/cim10) | A CIM10 terminology matcher |
+| [`eds.umls`](/pipelines/ner/umls)                                                          | A UMLS terminology matcher             |
+| [`eds.ckd`](/pipelines/ner/disorders/ckd) | CKD extractor |
+| [`eds.copd`](/pipelines/ner/disorders/copd) | COPD extractor |
+| [`eds.cerebrovascular_accident`](/pipelines/ner/disorders/cerebrovascular-accident) | Cerebrovascular accident extractor |
+| [`eds.congestive_heart_failure`](/pipelines/ner/disorders/congestive-heart-failure) | Congestive heart failure extractor |
+| [`eds.connective_tissue_disease`](/pipelines/ner/disorders/connective-tissue-disease) | Connective tissue disease extractor |
+| [`eds.dementia`](/pipelines/ner/disorders/dementia) | Dementia extractor |
+| [`eds.diabetes`](/pipelines/ner/disorders/diabetes) | Diabetes extractor |
+| [`eds.hemiplegia`](/pipelines/ner/disorders/hemiplegia) | Hemiplegia extractor |
+| [`eds.leukemia`](/pipelines/ner/disorders/leukemia) | Leukemia extractor |
+| [`eds.liver_disease`](/pipelines/ner/disorders/liver-disease) | Liver disease extractor |
+| [`eds.lymphoma`](/pipelines/ner/disorders/lymphoma) | Lymphoma extractor |
+| [`eds.myocardial_infarction`](/pipelines/ner/disorders/myocardial-infarction) | Myocardial infarction extractor |
+| [`eds.peptic_ulcer_disease`](/pipelines/ner/disorders/peptic-ulcer-disease) | Peptic ulcer disease extractor |
+| [`eds.peripheral_vascular_disease`](/pipelines/ner/disorders/peripheral-vascular-disease) | Peripheral vascular disease extractor |
+| [`eds.solid_tumor`](/pipelines/ner/disorders/solid-tumor) | Solid tumor extractor |
+| [`eds.alcohol`](/pipelines/ner/behaviors/alcohol) | Alcohol consumption extractor |
+| [`eds.tobacco`](/pipelines/ner/behaviors/tobacco) | Tobacco consumption extractor |
+
+
diff --git a/docs/pipelines/ner/score.md b/docs/pipelines/ner/score.md
deleted file mode 100644
index ec56035ee..000000000
--- a/docs/pipelines/ner/score.md
+++ /dev/null
@@ -1,160 +0,0 @@
-# Score
-
-The `eds.score` pipeline allows easy extraction of typical scores (Charlson, SOFA...) that can be found in clinical documents.
-The pipeline works by
-
-- Extracting the score's name via the provided regular expressions
-- Extracting the score's _raw_ value via another set of RegEx
-- Normalising the score's value via a normalising function
-
-## Charlson Comorbidity Index
-
-Built on the `eds.score` pipeline, the `charlson` pipeline will extract the [Charlson Comorbidity Index](https://www.mdcalc.com/charlson-comorbidity-index-cci):
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe("eds.normalizer")
-nlp.add_pipe("eds.charlson")
-
-text = "Charlson à l'admission: 7.\n" "Charlson: \n" "OMS: \n"
-
-doc = nlp(text)
-doc.ents
-# Out: (Charlson à l'admission: 7,)
-```
-
-We can see that only one occurrence was extracted. The second mention of Charlson in the text
-doesn't contain any numerical value, so it isn't extracted.
-
-Each extraction exposes 2 extensions:
-
-```python
-ent = doc.ents[0]
-
-ent._.score_name
-# Out: 'eds.charlson'
-
-ent._.score_value
-# Out: 7
-```
-
-## SOFA score
-
-The `SOFA` pipe extracts [Sequential Organ Failure Assessment (SOFA) scores](https://www.mdcalc.com/calc/691/sequential-organ-failure-assessment-sofa-score), used to track a person's status during their stay in an intensive care unit and to determine the extent of a person's organ function or rate of failure.
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe("eds.normalizer")
-nlp.add_pipe("eds.SOFA")
-
-text = "SOFA (à 24H) : 12.\n" "OMS: \n"
-
-doc = nlp(text)
-doc.ents
-# Out: (SOFA (à 24H) : 12,)
-```
-
-Each extraction exposes 3 extensions:
-
-```python
-ent = doc.ents[0]
-
-ent._.score_name
-# Out: 'eds.SOFA'
-
-ent._.score_value
-# Out: 12
-
-ent._.score_method
-# Out: '24H'
-```
-
-The score method can be "24H", "Maximum", "A l'admission" or "Non précisée".
-
-## TNM score
-
-The `eds.TNM` pipe extracts TNM scores.
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe("eds.TNM")
-
-text = "TNM: pTx N1 M1"
-
-doc = nlp(text)
-doc.ents
-# Out: (pTx N1 M1,)
-
-ent = doc.ents[0]
-ent._.value.dict()
-# {'modifier': 'p',
-# 'tumour': None,
-# 'tumour_specification': 'x',
-# 'node': '1',
-# 'node_specification': None,
-# 'metastasis': '1',
-# 'resection_completeness': None,
-# 'version': None,
-# 'version_year': None}
-```
-
-The TNM score extraction is based on the work of S. Priou, B. Rance and E. Kempf [@kempf:hal-03519085].
-
-## Implementing your own score
-
-Using the `eds.score` pipeline, you only have to change its configuration in order to implement a _simple_ score extraction algorithm. As an example, let us see the configuration used for the `eds.charlson` pipe.
-The configuration consists of four items:
-
-- `score_name`: The name of the score
-- `regex`: A list of regular expressions to detect the score's mention
-- `value_extract`: A regular expression to extract the score's value in the context of the score's mention
-- `score_normalization`: A function name used to normalise the score's _raw_ value
-
-!!! note
-
-    spaCy doesn't allow passing functions in the configuration of a pipeline.
-    To circumvent this issue, functions need to be registered, which simply consists
-    in decorating those functions.
-
-The registration is done as follows:
-
-```python
-@spacy.registry.misc("score_normalization.charlson")
-def my_normalization_score(raw_score: str):
- # Implement some filtering here
- # Return None if you want the score to be discarded
- return normalized_score
-```
-
-The values used for the `eds.charlson` pipe are the following:
-
-```python
-@spacy.registry.misc("score_normalization.charlson")
-def score_normalization(extracted_score):
- """
- Charlson score normalization.
- If available, returns the integer value of the Charlson score.
- """
- score_range = list(range(0, 30))
- if (extracted_score is not None) and (int(extracted_score) in score_range):
- return int(extracted_score)
-
-
-charlson_config = dict(
- score_name="charlson",
- regex=[r"charlson"],
- value_extract=r"charlson.*[\n\W]*(\d+)",
- score_normalization="score_normalization.charlson",
-)
-```
-
-\bibliography
diff --git a/docs/pipelines/ner/scores/charlson.md b/docs/pipelines/ner/scores/charlson.md
new file mode 100644
index 000000000..bed0595a8
--- /dev/null
+++ b/docs/pipelines/ner/scores/charlson.md
@@ -0,0 +1,8 @@
+# Charlson {: #edsnlp.pipelines.ner.scores.charlson.factory.create_component }
+
+::: edsnlp.pipelines.ner.scores.charlson.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/scores/elston-ellis.md b/docs/pipelines/ner/scores/elston-ellis.md
new file mode 100644
index 000000000..af8c080cf
--- /dev/null
+++ b/docs/pipelines/ner/scores/elston-ellis.md
@@ -0,0 +1,8 @@
+# Elston-Ellis {: #edsnlp.pipelines.ner.scores.elston_ellis.factory.create_component }
+
+::: edsnlp.pipelines.ner.scores.elston_ellis.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/scores/emergency-ccmu.md b/docs/pipelines/ner/scores/emergency-ccmu.md
new file mode 100644
index 000000000..75b4f0707
--- /dev/null
+++ b/docs/pipelines/ner/scores/emergency-ccmu.md
@@ -0,0 +1,8 @@
+# Emergency CCMU {: #edsnlp.pipelines.ner.scores.emergency.ccmu.factory.create_component }
+
+::: edsnlp.pipelines.ner.scores.emergency.ccmu.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/scores/emergency-gemsa.md b/docs/pipelines/ner/scores/emergency-gemsa.md
new file mode 100644
index 000000000..9597bb35d
--- /dev/null
+++ b/docs/pipelines/ner/scores/emergency-gemsa.md
@@ -0,0 +1,8 @@
+# Emergency GEMSA {: #edsnlp.pipelines.ner.scores.emergency.gemsa.factory.create_component }
+
+::: edsnlp.pipelines.ner.scores.emergency.gemsa.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/scores/emergency-priority.md b/docs/pipelines/ner/scores/emergency-priority.md
new file mode 100644
index 000000000..7a0205954
--- /dev/null
+++ b/docs/pipelines/ner/scores/emergency-priority.md
@@ -0,0 +1,8 @@
+# Emergency Priority {: #edsnlp.pipelines.ner.scores.emergency.priority.factory.create_component }
+
+::: edsnlp.pipelines.ner.scores.emergency.priority.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/scores/overview.md b/docs/pipelines/ner/scores/overview.md
new file mode 100644
index 000000000..16a32ca55
--- /dev/null
+++ b/docs/pipelines/ner/scores/overview.md
@@ -0,0 +1,69 @@
+# Scores Overview
+
+EDS-NLP provides multiple matchers for typical scores (Charlson, SOFA...) found in clinical documents.
+To extract a score, the matcher:
+
+- extracts the score's name via the provided regular expressions
+- extracts the score's _raw_ value via another set of RegEx
+- normalizes the score's value via a normalization function
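+
+For instance, a minimal sketch with the Charlson matcher (the `_.score_name` and `_.score_value` extensions are those exposed by these matchers):
+
+```{ .python .no-check }
+import spacy
+
+nlp = spacy.blank("fr")
+nlp.add_pipe("eds.sentences")
+nlp.add_pipe("eds.normalizer")
+nlp.add_pipe("eds.charlson")
+
+doc = nlp("Charlson à l'admission: 7.")
+ent = doc.ents[0]
+
+ent._.score_name, ent._.score_value
+# Out: ('eds.charlson', 7)
+```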
+
+## Available scores
+
+| Component | Description |
+|--------------------------------------------------|-------------------------------|
+| [`eds.charlson`](./charlson) | A Charlson score extractor |
+| [`eds.emergency_ccmu`](./emergency-ccmu) | A CCMU score extractor |
+| [`eds.emergency_gemsa`](./emergency-gemsa) | A GEMSA score extractor |
+| [`eds.emergency_priority`](./emergency-priority) | A priority score extractor |
+| [`eds.sofa`](./sofa) | A SOFA score extractor |
+| [`eds.tnm`](./tnm) | A TNM score extractor |
+
+## Implementing your own score
+
+Using the `eds.score` pipeline, you only have to change its configuration in order to implement a _simple_ score extraction algorithm. As an example, let us see the configuration used for the `eds.charlson` pipe.
+The configuration consists of four items:
+
+- `score_name`: The name of the score
+- `regex`: A list of regular expressions to detect the score's mention
+- `value_extract`: A regular expression to extract the score's value in the context of the score's mention
+- `score_normalization`: A function name used to normalize the score's _raw_ value
+
+!!! note
+
+    Functions passed as parameters to components need to be registered as follows:
+
+ ```python
+ import spacy
+
+
+ @spacy.registry.misc("score_normalization.charlson")
+ def my_normalization_score(raw_score: str):
+ # Implement some filtering here
+ # Return None if you want the score to be discarded
+ return normalized_score
+ ```
+
+The values used for the `eds.charlson` pipe are the following:
+
+```python
+import spacy
+
+
+@spacy.registry.misc("score_normalization.charlson")
+def score_normalization(extracted_score):
+ """
+ Charlson score normalization.
+ If available, returns the integer value of the Charlson score.
+ """
+ score_range = list(range(0, 30))
+ if (extracted_score is not None) and (int(extracted_score) in score_range):
+ return int(extracted_score)
+
+
+charlson_config = dict(
+ score_name="charlson",
+ regex=[r"charlson"],
+ value_extract=r"charlson.*[\n\W]*(\d+)",
+ score_normalization="score_normalization.charlson",
+)
+```
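+
+Assuming such a configuration, the matcher can then be added to an existing pipeline as follows (sketch):
+
+```{ .python .no-check }
+nlp.add_pipe("eds.score", config=charlson_config)
+```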
diff --git a/docs/pipelines/ner/scores/sofa.md b/docs/pipelines/ner/scores/sofa.md
new file mode 100644
index 000000000..a1b5993c6
--- /dev/null
+++ b/docs/pipelines/ner/scores/sofa.md
@@ -0,0 +1,8 @@
+# SOFA {: #edsnlp.pipelines.ner.scores.sofa.factory.create_component }
+
+::: edsnlp.pipelines.ner.scores.sofa.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/tnm.md b/docs/pipelines/ner/tnm.md
new file mode 100644
index 000000000..7d0261226
--- /dev/null
+++ b/docs/pipelines/ner/tnm.md
@@ -0,0 +1,8 @@
+# TNM {: #edsnlp.pipelines.ner.tnm.factory.create_component }
+
+::: edsnlp.pipelines.ner.tnm.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/ner/umls.md b/docs/pipelines/ner/umls.md
index 4d496dfdb..ead82c6af 100644
--- a/docs/pipelines/ner/umls.md
+++ b/docs/pipelines/ner/umls.md
@@ -1,64 +1,8 @@
-# UMLS
-
-The `eds.umls` pipeline component matches the UMLS (Unified Medical Language System from NIH) terminology.
-
-!!! warning "Very low recall"
-
- When using the `exact` matching mode, this component has a very poor recall performance.
- We can use the `simstring` mode to retrieve approximate matches, albeit at the cost of a significantly higher computation time.
-
-## Usage
-
-`eds.umls` is an additional module that needs to be setup by:
-
-1. `pip install -U umls_downloader`
-2. [Signing up for a UMLS Terminology Services Account](https://uts.nlm.nih.gov/uts/signup-login). After filling a short form, you will receive your token API within a few days.
-3. Set `UMLS_API_KEY` locally: `export UMLS_API_KEY=your_api_key`
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.umls")
-
-text = "Grosse toux: le malade a été mordu par des Amphibiens " "sous le genou"
-
-doc = nlp(text)
-
-doc.ents
-# Out: (toux, a, par, Amphibiens, genou)
-
-ent = doc.ents[0]
-
-ent.label_
-# Out: umls
-
-ent._.umls
-# Out: C0010200
-```
-
-You can easily change the default languages and sources with the `pattern_config` argument:
-
-```python
-import spacy
-
-# Enable the french and english languages, through the french MeSH and LOINC
-pattern_config = dict(languages=["FRE", "ENG"], sources=["MSHFRE", "LNC"])
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.umls", config=dict(pattern_config=pattern_config))
-```
-
-See more options of languages and sources [here](https://www.nlm.nih.gov/research/umls/sourcereleasedocs/index.html).
-
-## Configuration
-
-The pipeline can be configured using the following parameters :
+# UMLS {: #edsnlp.pipelines.ner.umls.factory.create_component }
::: edsnlp.pipelines.ner.umls.factory.create_component
options:
- only_parameters: true
-
-## Authors and citation
-
-The `eds.umls` pipeline was developed by AP-HP's Data Science team and INRIA SODA's team.
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/overview.md b/docs/pipelines/overview.md
new file mode 100644
index 000000000..c5058ad81
--- /dev/null
+++ b/docs/pipelines/overview.md
@@ -0,0 +1,71 @@
+# Pipes overview
+
+EDS-NLP provides easy-to-use pipeline components (aka pipes).
+
+## Available components
+
+
+
+=== "Core"
+
+    See the [Core components overview](/pipelines/core/overview/) for more information.
+
+ --8<-- "docs/pipelines/core/overview.md:components"
+
+=== "Qualifiers"
+
+ See the [Qualifiers overview](/pipelines/qualifiers/overview/) for more information.
+
+ --8<-- "docs/pipelines/qualifiers/overview.md:components"
+
+=== "Miscellaneous"
+
+ See the [Miscellaneous components overview](/pipelines/misc/overview/) for more information.
+
+ --8<-- "docs/pipelines/misc/overview.md:components"
+
+=== "NER"
+
+ See the [NER overview](/pipelines/ner/overview/) for more information.
+
+ --8<-- "docs/pipelines/ner/overview.md:components"
+
+=== "Trainable"
+
+ | Pipeline | Description |
+ | -------------------- | -------------------------------------------------------------------- |
+ | `eds.nested-ner` | A trainable component for nested (and classic) NER |
+ | `eds.span-qualifier` | A trainable component for multi-class multi-label span qualification |
+
+
+
+You can add them to your pipeline by simply calling `add_pipe`, for instance:
+
+```python
+import spacy
+
+nlp = spacy.blank("eds")
+nlp.add_pipe("eds.normalizer")
+nlp.add_pipe("eds.sentences")
+nlp.add_pipe("eds.tnm")
+```
+
+## Basic architecture
+
+Most components provided by EDS-NLP aim to qualify pre-extracted entities. Hence, the basic usage of the library (sketched after this list) is to:
+
+1. Implement a normaliser (see [`normalizer`](./core/normalizer.md))
+2. Add an entity recognition component (eg the simple but powerful [`matcher` component](./core/matcher.md))
+3. Add zero or more entity qualification components, such as [`negation`](./qualifiers/negation.md), [`family`](./qualifiers/family.md) or [`hypothesis`](./qualifiers/hypothesis.md). These qualifiers typically help detect false-positives.
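+
+A minimal sketch of this three-step architecture (the terminology passed to the matcher is a dummy example):
+
+```python
+import spacy
+
+nlp = spacy.blank("fr")
+nlp.add_pipe("eds.normalizer")  # 1. normalisation
+nlp.add_pipe("eds.sentences")
+nlp.add_pipe(
+    "eds.matcher",
+    config=dict(terms=dict(douleur="douleur")),
+)  # 2. entity recognition
+nlp.add_pipe("eds.negation")  # 3. qualification
+```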
+
+## Extraction components
+
+Extraction components (matchers, the date detector or NER components, for instance) write their results directly to the `doc.ents` and `doc.spans` attributes.
+
+By default, some components do not write their output to `doc.ents`, such as the `eds.sections` matcher. This is mainly because `doc.ents` cannot contain overlapping entities: we [filter spans][edsnlp.utils.filter.filter_spans] and keep the largest one by default. Since sections usually cover large spans of text, storing them in `doc.ents` would discard every other overlapping entity.
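+
+For instance, a sketch reusing the pipeline above (assuming the `sections` span group used by the matcher):
+
+```{ .python .no-check }
+nlp.add_pipe("eds.sections")
+doc = nlp("ANTÉCÉDENTS : le patient a des antécédents de malaises.")
+
+doc.spans["sections"]  # overlapping section spans, kept out of doc.ents
+```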
+
+## Entity tagging
+
+Most components also declare [extensions](https://spacy.io/usage/processing-pipelines#custom-components-attributes) on the `Doc`, `Span` and/or `Token` objects.
+
+These extensions are especially useful for qualifier components, but can also be used by other components to persist relevant information. For instance, the `eds.dates` component declares a `span._.date` extension to store a normalised version of each detected date.
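+
+For example (a sketch, reusing the pipeline above; the exact attributes of the normalised date object are detailed on the `eds.dates` page):
+
+```{ .python .no-check }
+nlp.add_pipe("eds.dates")
+doc = nlp("Le patient est admis le 23 août 2021.")
+
+date = doc.spans["dates"][0]
+date._.date  # normalised representation of the detected date
+```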
diff --git a/docs/pipelines/qualifiers/family.md b/docs/pipelines/qualifiers/family.md
index dceeabe26..7b125231c 100644
--- a/docs/pipelines/qualifiers/family.md
+++ b/docs/pipelines/qualifiers/family.md
@@ -1,60 +1,8 @@
-# Family
-
-The `eds.family` pipeline uses a simple rule-based algorithm to detect spans that describe a family member (or family history) of the patient rather than the patient themself.
-
-## Usage
-
-The following snippet matches a simple terminology, and checks the family context of the extracted entities. It is complete, and can be run _as is_.
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.sentences")
-# Dummy matcher
-nlp.add_pipe(
- "eds.matcher",
- config=dict(terms=dict(douleur="douleur", osteoporose="ostéoporose")),
-)
-nlp.add_pipe("eds.family")
-
-text = (
- "Le patient est admis le 23 août 2021 pour une douleur au bras. "
- "Il a des antécédents familiaux d'ostéoporose"
-)
-
-doc = nlp(text)
-
-doc.ents
-# Out: (douleur, ostéoporose)
-
-doc.ents[0]._.family
-# Out: False
-
-doc.ents[1]._.family
-# Out: True
-```
-
-## Configuration
-
-The pipeline can be configured using the following parameters :
-
-| Parameter | Explanation | Default |
-| -------------- | ------------------------------------------------------------------------ | --------------------------------- |
-| `attr` | spaCy attribute to match on (eg `NORM`, `TEXT`, `LOWER`) | `"NORM"` |
-| `family` | Family patterns | `None` (use pre-defined patterns) |
-| `termination` | Termination patterns (for syntagma/proposition extraction) | `None` (use pre-defined patterns) |
-| `use_sections` | Whether to use pre-annotated sections (requires the `sections` pipeline) | `False` |
-| `on_ents_only` | Whether to qualify pre-extracted entities only | `True` |
-| `explain` | Whether to keep track of the cues for each entity | `False` |
-
-## Declared extensions
-
-The `eds.family` pipeline declares two [spaCy extensions](https://spacy.io/usage/processing-pipelines#custom-components-attributes), on both `Span` and `Token` objects :
-
-1. The `family` attribute is a boolean, set to `True` if the pipeline predicts that the span/token relates to a family member.
-2. The `family_` property is a human-readable string, computed from the `family` attribute. It implements a simple getter function that outputs `PATIENT` or `FAMILY`, depending on the value of `family`.
-
-## Authors and citation
-
-The `eds.family` pipeline was developed by AP-HP's Data Science team.
+# Family Context {: #edsnlp.pipelines.qualifiers.family.factory.create_component }
+
+::: edsnlp.pipelines.qualifiers.family.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/qualifiers/history.md b/docs/pipelines/qualifiers/history.md
index 00cce32e7..ef755e385 100644
--- a/docs/pipelines/qualifiers/history.md
+++ b/docs/pipelines/qualifiers/history.md
@@ -1,105 +1,8 @@
-# Medical History
-
-The `eds.history` pipeline uses a simple rule-based algorithm to detect spans that describe medical history rather than the diagnostic of a given visit.
-
-The mere definition of an medical history is not straightforward.
-Hence, this component only tags entities that are _explicitly described as part of the medical history_,
-eg preceded by a synonym of "medical history".
-
-This component may also use the output of:
-
-- the [`eds.sections` pipeline](../misc/sections.md). In that case, the entire `antécédent` section is tagged as a medical history.
-
-!!! warning "Sections"
-
- Be careful, the `eds.sections` component may oversize the `antécédents` section. Indeed, it detects *section titles*
- and tags the entire text between a title and the next as a section. Hence, should a section title goes undetected after
- the `antécédents` title, some parts of the document will erroneously be tagged as a medical history.
-
- To curb that possibility, using the output of the `eds.sections` component is deactivated by default.
-
-- the [`eds.dates` pipeline](../misc/dates.md). In that case, it will take the dates into account to tag extracted entities as a medical history or not.
-
-!!! info "Dates"
-
- To take the most of the `eds.dates` component, you may add the ``note_datetime`` context (cf. [Adding context][using-eds-nlps-helper-functions]). It allows the pipeline to compute the duration of absolute dates (eg le 28 août 2022/August 28, 2022). The ``birth_datetime`` context allows the pipeline to exclude the birth date from the extracted dates.
-
-## Usage
-
-The following snippet matches a simple terminology, and checks whether the extracted entities are history or not. It is complete and can be run _as is_.
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.sentences")
-nlp.add_pipe("eds.normalizer")
-nlp.add_pipe("eds.sections")
-nlp.add_pipe("eds.dates")
-nlp.add_pipe(
- "eds.matcher",
- config=dict(terms=dict(douleur="douleur", malaise="malaises")),
-)
-nlp.add_pipe(
- "eds.history",
- config=dict(
- use_sections=True,
- use_dates=True,
- ),
-)
-
-text = (
- "Le patient est admis le 23 août 2021 pour une douleur au bras. "
- "Il a des antécédents de malaises."
- "ANTÉCÉDENTS : "
- "- le patient a déjà eu des malaises. "
- "- le patient a eu une douleur à la jambe il y a 10 jours"
-)
-
-doc = nlp(text)
-
-doc.ents
-# Out: (douleur, malaises, malaises, douleur)
-
-doc.ents[0]._.history
-# Out: False
-
-doc.ents[1]._.history
-# Out: True
-
-doc.ents[2]._.history # (1)
-# Out: True
-
-doc.ents[3]._.history # (2)
-# Out: False
-```
-
-1. The entity is in the section `antécédent`.
-2. The entity is in the section `antécédent`, however the extracted `relative_date` refers to an event that took place within 14 days.
-## Configuration
-
-The pipeline can be configured using the following parameters :
-
-| Parameter | Explanation | Default |
-| -------------------- | -------------------------------------------------------------------------------------------------------------------- | --------------------------------- |
-| `attr` | spaCy attribute to match on (eg `NORM`, `TEXT`, `LOWER`) | `"NORM"` |
-| `history` | History patterns | `None` (use pre-defined patterns) |
-| `termination` | Termination patterns (for syntagma/proposition extraction) | `None` (use pre-defined patterns) |
-| `use_sections` | Whether to use pre-annotated sections (requires the `sections` pipeline) | `False` |
-| `use_dates` | Whether to use dates pipeline (requires the `dates` pipeline and ``note_datetime`` context is recommended) | `False` |
-| `history_limit` | If `use_dates = True`. The number of days after which the event is considered as history. | `14` (2 weeks) |
-| `exclude_birthdate` | If `use_dates = True`. Whether to exclude the birth date from history dates. | `True` |
-| `closest_dates_only` | If `use_dates = True`. Whether to include the closest dates only. If `False`, it includes all dates in the sentence. | `True` |
-| `on_ents_only` | Whether to qualify pre-extracted entities only | `True` |
-| `explain` | Whether to keep track of the cues for each entity | `False` |
-
-## Declared extensions
-
-The `eds.history` pipeline declares two [spaCy extensions](https://spacy.io/usage/processing-pipelines#custom-components-attributes), on both `Span` and `Token` objects :
-
-1. The `history` attribute is a boolean, set to `True` if the pipeline predicts that the span/token is a medical history.
-2. The `history_` property is a human-readable string, computed from the `history` attribute. It implements a simple getter function that outputs `CURRENT` or `ATCD`, depending on the value of `history`.
-
-## Authors and citation
-
-The `eds.history` pipeline was developed by AP-HP's Data Science team.
+# Medical History {: #edsnlp.pipelines.qualifiers.history.factory.create_component }
+
+::: edsnlp.pipelines.qualifiers.history.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/qualifiers/hypothesis.md b/docs/pipelines/qualifiers/hypothesis.md
index 7e0df953c..abe631f43 100644
--- a/docs/pipelines/qualifiers/hypothesis.md
+++ b/docs/pipelines/qualifiers/hypothesis.md
@@ -1,84 +1,8 @@
-# Hypothesis
-
-The `eds.hypothesis` pipeline uses a simple rule-based algorithm to detect spans that are speculations rather than certain statements.
-
-## Usage
-
-The following snippet matches a simple terminology, and checks whether the extracted entities are part of a speculation. It is complete and can be run _as is_.
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.sentences")
-# Dummy matcher
-nlp.add_pipe(
- "eds.matcher",
- config=dict(terms=dict(douleur="douleur", fracture="fracture")),
-)
-nlp.add_pipe("eds.hypothesis")
-
-text = (
- "Le patient est admis le 23 août 2021 pour une douleur au bras. "
- "Possible fracture du radius."
-)
-
-doc = nlp(text)
-
-doc.ents
-# Out: (douleur, fracture)
-
-doc.ents[0]._.hypothesis
-# Out: False
-
-doc.ents[1]._.hypothesis
-# Out: True
-```
-
-## Configuration
-
-The pipeline can be configured using the following parameters :
-
-| Parameter | Explanation | Default |
-| -------------- | ---------------------------------------------------------- | --------------------------------- |
-| `attr` | spaCy attribute to match on (eg `NORM`, `TEXT`, `LOWER`) | `"NORM"` |
-| `pseudo` | Pseudo-hypothesis patterns | `None` (use pre-defined patterns) |
-| `preceding` | Preceding hypothesis patterns | `None` (use pre-defined patterns) |
-| `following` | Following hypothesis patterns | `None` (use pre-defined patterns) |
-| `termination` | Termination patterns (for syntagma/proposition extraction) | `None` (use pre-defined patterns) |
-| `verbs_hyp` | Patterns for verbs that imply a hypothesis | `None` (use pre-defined patterns) |
-| `verbs_eds` | Common verb patterns, checked for conditional mode | `None` (use pre-defined patterns) |
-| `on_ents_only` | Whether to qualify pre-extracted entities only | `True` |
-| `within_ents` | Whether to look for hypothesis within entities | `False` |
-| `explain` | Whether to keep track of the cues for each entity | `False` |
-
-## Declared extensions
-
-The `eds.hypothesis` pipeline declares two [spaCy extensions](https://spacy.io/usage/processing-pipelines#custom-components-attributes), on both `Span` and `Token` objects :
-
-1. The `hypothesis` attribute is a boolean, set to `True` if the pipeline predicts that the span/token is a speculation.
-2. The `hypothesis_` property is a human-readable string, computed from the `hypothesis` attribute. It implements a simple getter function that outputs `HYP` or `CERT`, depending on the value of `hypothesis`.
-
-## Performance
-
-The pipeline's performance is measured on three datasets :
-
-- The ESSAI[@dalloux2017ESSAI] and CAS[@grabar2018CAS] datasets were developed at the CNRS. The two are concatenated.
-- The NegParHyp corpus was specifically developed at EDS to test the pipeline on actual clinical notes, using pseudonymised notes from the EDS.
-
-| Dataset | Hypothesis F1 |
-| --------- | ------------- |
-| CAS/ESSAI | 49% |
-| NegParHyp | 52% |
-
-!!! note "NegParHyp corpus"
-
- The NegParHyp corpus was built by matching a subset of the MeSH terminology with around 300 documents
- from AP-HP's clinical data warehouse.
- Matched entities were then labelled for negation, speculation and family context.
-
-## Authors and citation
-
-The `eds.hypothesis` pipeline was developed by AP-HP's Data Science team.
-
-\bibliography
+# Hypothesis {: #edsnlp.pipelines.qualifiers.hypothesis.factory.create_component }
+
+::: edsnlp.pipelines.qualifiers.hypothesis.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/qualifiers/negation.md b/docs/pipelines/qualifiers/negation.md
index 98e17f84a..4d1eb4dff 100644
--- a/docs/pipelines/qualifiers/negation.md
+++ b/docs/pipelines/qualifiers/negation.md
@@ -1,85 +1,8 @@
-# Negation
-
-The `eds.negation` pipeline uses a simple rule-based algorithm to detect negated spans. It was designed at AP-HP's EDS, following the insights of the NegEx algorithm by Chapman et al[@chapman_simple_2001].
-
-## Usage
-
-The following snippet matches a simple terminology, and checks the polarity of the extracted entities. It is complete and can be run _as is_.
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.sentences")
-# Dummy matcher
-nlp.add_pipe(
- "eds.matcher",
- config=dict(terms=dict(patient="patient", fracture="fracture")),
-)
-nlp.add_pipe("eds.negation")
-
-text = (
- "Le patient est admis le 23 août 2021 pour une douleur au bras. "
- "Le scanner ne détecte aucune fracture."
-)
-
-doc = nlp(text)
-
-doc.ents
-# Out: (patient, fracture)
-
-doc.ents[0]._.negation # (1)
-# Out: False
-
-doc.ents[1]._.negation
-# Out: True
-```
-
-1. The result of the pipeline is kept in the `negation` custom extension.
-
-## Configuration
-
-The pipeline can be configured using the following parameters :
-
-| Parameter | Explanation | Default |
-| -------------- | ---------------------------------------------------------- | --------------------------------- |
-| `attr` | spaCy attribute to match on (eg `NORM`, `TEXT`, `LOWER`) | `"NORM"` |
-| `pseudo` | Pseudo-negation patterns | `None` (use pre-defined patterns) |
-| `preceding` | Preceding negation patterns | `None` (use pre-defined patterns) |
-| `following` | Following negation patterns | `None` (use pre-defined patterns) |
-| `termination` | Termination patterns (for syntagma/proposition extraction) | `None` (use pre-defined patterns) |
-| `verbs` | Patterns for verbs that imply a negation | `None` (use pre-defined patterns) |
-| `on_ents_only` | Whether to qualify pre-extracted entities only | `True` |
-| `within_ents` | Whether to look for negations within entities | `False` |
-| `explain` | Whether to keep track of the cues for each entity | `False` |
-
-## Declared extensions
-
-The `eds.negation` pipeline declares two [spaCy extensions](https://spacy.io/usage/processing-pipelines#custom-components-attributes), on both `Span` and `Token` objects :
-
-1. The `negation` attribute is a boolean, set to `True` if the pipeline predicts that the span/token is negated.
-2. The `negation_` property is a human-readable string, computed from the `negation` attribute. It implements a simple getter function that outputs `AFF` or `NEG`, depending on the value of `negation`.
-
-## Performance
-
-The pipeline's performance is measured on three datasets :
-
-- The ESSAI[@dalloux2017ESSAI] and CAS[@grabar2018CAS] datasets were developed at the CNRS. The two are concatenated.
-- The NegParHyp corpus was specifically developed at AP-HP to test the pipeline on actual clinical notes, using pseudonymised notes from the AP-HP.
-
-| Dataset | Negation F1 |
-| --------- | ----------- |
-| CAS/ESSAI | 71% |
-| NegParHyp | 88% |
-
-!!! note "NegParHyp corpus"
-
- The NegParHyp corpus was built by matching a subset of the MeSH terminology with around 300 documents
- from AP-HP's clinical data warehouse.
- Matched entities were then labelled for negation, speculation and family context.
-
-## Authors and citation
-
-The `eds.negation` pipeline was developed by AP-HP's Data Science team.
-
-\bibliography
+# Negation {: #edsnlp.pipelines.qualifiers.negation.factory.create_component }
+
+::: edsnlp.pipelines.qualifiers.negation.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/qualifiers/index.md b/docs/pipelines/qualifiers/overview.md
similarity index 68%
rename from docs/pipelines/qualifiers/index.md
rename to docs/pipelines/qualifiers/overview.md
index a4e2552f8..936fdc58d 100644
--- a/docs/pipelines/qualifiers/index.md
+++ b/docs/pipelines/qualifiers/overview.md
@@ -1,16 +1,21 @@
-# Qualifier overview
+# Qualifier Overview
-In EDS-NLP, we call _qualifiers_ the suite of pipelines designed to _qualify_ a pre-extracted entity for a linguistic modality.
+In EDS-NLP, we call _qualifiers_ the suite of components designed to _qualify_ a
+pre-extracted entity for a linguistic modality.
-## Available pipelines
+## Available components
-| Name | Description |
-| --------------------- | -------------------------------------------------------------------- |
-| `eds.negation` | Detect negated entities |
-| `eds.family` | Detect entities that pertain to a patient's kin rather than themself |
-| `eds.hypothesis` | Detect entities subject to speculation |
-| `eds.reported_speech` | Detect entities that are quoted from the patient |
-| `eds.history` | Detect entities that pertain to the patient's history |
+
+
+| Pipeline | Description |
+|----------------------------------------------------------------|--------------------------------------|
+| [`eds.negation`](/pipelines/qualifiers/negation) | Rule-based negation detection |
+| [`eds.family`](/pipelines/qualifiers/family) | Rule-based family context detection |
+| [`eds.hypothesis`](/pipelines/qualifiers/hypothesis) | Rule-based speculation detection |
+| [`eds.reported_speech`](/pipelines/qualifiers/reported-speech) | Rule-based reported speech detection |
+| [`eds.history`](/pipelines/qualifiers/history) | Rule-based medical history detection |
+
+
## Rationale
@@ -43,6 +48,17 @@ There is an obvious problem: none of these examples should lead us to include th
To curb this issue, EDS-NLP proposes rule-based pipelines that qualify entities to help the user make an informed decision about which patient should be included in a real-world data cohort.
+## Which spans are qualified? {: #edsnlp.pipelines.base.SpanGetterArg }
+
+A component gets entities from a document by looking up `doc.ents` or `doc.spans[group]`. This behavior is set by the `span_getter` argument in components that support it, as sketched after the reference below.
+
+::: edsnlp.pipelines.base.SpanGetterArg
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
+
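+For instance, a hypothetical configuration (the dict form shown here is an assumption; see the reference above for the supported values):
+
+```{ .python .no-check }
+nlp.add_pipe(
+    "eds.negation",
+    config=dict(span_getter={"ents": True, "my_custom_group": True}),
+)
+```
+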
## Under the hood
Our _qualifier_ pipelines all follow the same basic pattern:
@@ -71,14 +87,12 @@ Albeit simple, this algorithm can achieve very good performance depending on the
You may use EDS-NLP's:
-
-
- ```python
+ ```{ .python .no-check }
nlp.add_pipe("eds.sentences")
```
## Persisting the results
-Our qualifier pipelines write their results to a custom [spaCy extension](https://spacy.io/usage/processing-pipelines#custom-components-attributes){target=_blank}, defined on both `Span` and `Token` objects. We follow the convention of naming said attribute after the pipeline itself, eg `Span._.negation`for the`eds.negation` pipeline. In most cases, that extension is a boolean.
+Our qualifier pipelines write their results to a custom [spaCy extension](https://spacy.io/usage/processing-pipelines#custom-components-attributes){target=_blank}, defined on both `Span` and `Token` objects. We follow the convention of naming said attribute after the pipeline itself, eg `Span._.negation` for the `eds.negation` pipeline.
We also provide a string representation of the result, computed on the fly by declaring a getter that reads the boolean result of the pipeline. Following spaCy convention, we give this attribute the same name, followed by a `_`.
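+
+Concretely, for the `eds.negation` pipeline (a sketch, assuming entities were extracted upstream):
+
+```{ .python .no-check }
+ent = doc.ents[0]
+ent._.negation  # boolean result of the pipeline
+ent._.negation_  # human-readable string, computed on the fly
+```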
diff --git a/docs/pipelines/qualifiers/reported-speech.md b/docs/pipelines/qualifiers/reported-speech.md
index d5a83fc89..9d4ebd2cb 100644
--- a/docs/pipelines/qualifiers/reported-speech.md
+++ b/docs/pipelines/qualifiers/reported-speech.md
@@ -1,64 +1,8 @@
-# Reported Speech
-
-The `eds.reported_speech` pipeline uses a simple rule-based algorithm to detect spans that relate to reported speech (eg when the doctor quotes the patient).
-It was designed at AP-HP's EDS.
-
-## Usage
-
-The following snippet matches a simple terminology, and checks whether the extracted entities are part of a reported speech. It is complete and can be run _as is_.
-
-```python
-import spacy
-
-nlp = spacy.blank("fr")
-nlp.add_pipe("eds.sentences")
-# Dummy matcher
-nlp.add_pipe(
- "eds.matcher",
- config=dict(terms=dict(patient="patient", alcool="alcoolisé")),
-)
-nlp.add_pipe("eds.reported_speech")
-
-text = (
- "Le patient est admis aux urgences ce soir pour une douleur au bras. "
- "Il nie être alcoolisé."
-)
-
-doc = nlp(text)
-
-doc.ents
-# Out: (patient, alcoolisé)
-
-doc.ents[0]._.reported_speech
-# Out: False
-
-doc.ents[1]._.reported_speech
-# Out: True
-```
-
-## Configuration
-
-The pipeline can be configured using the following parameters :
-
-| Parameter | Explanation | Default |
-| -------------- | ---------------------------------------------------------- | --------------------------------- |
-| `attr` | spaCy attribute to match on (eg `NORM`, `TEXT`, `LOWER`) | `"NORM"` |
-| `pseudo` | Pseudo-reported speech patterns | `None` (use pre-defined patterns) |
-| `preceding` | Preceding reported speech patterns | `None` (use pre-defined patterns) |
-| `following` | Following reported speech patterns | `None` (use pre-defined patterns) |
-| `termination` | Termination patterns (for syntagma/proposition extraction) | `None` (use pre-defined patterns) |
-| `verbs` | Patterns for verbs that imply a reported speech | `None` (use pre-defined patterns) |
-| `on_ents_only` | Whether to qualify pre-extracted entities only | `True` |
-| `within_ents` | Whether to look for reported speech within entities | `False` |
-| `explain` | Whether to keep track of the cues for each entity | `False` |
-
-## Declared extensions
-
-The `eds.reported_speech` pipeline declares two [spaCy extensions](https://spacy.io/usage/processing-pipelines#custom-components-attributes), on both `Span` and `Token` objects :
-
-1. The `reported_speech` attribute is a boolean, set to `True` if the pipeline predicts that the span/token is reported.
-2. The `reported_speech_` property is a human-readable string, computed from the `reported_speech` attribute. It implements a simple getter function that outputs `DIRECT` or `REPORTED`, depending on the value of `reported_speech`.
-
-## Authors and citation
-
-The `eds.reported_speech` pipeline was developed by AP-HP's Data Science team.
+# Reported Speech {: #edsnlp.pipelines.qualifiers.reported_speech.factory.create_component }
+
+::: edsnlp.pipelines.qualifiers.reported_speech.factory.create_component
+ options:
+ heading_level: 2
+ show_bases: false
+ show_source: false
+ only_class_level: true
diff --git a/docs/pipelines/trainable/index.md b/docs/pipelines/trainable/index.md
index cddfec749..bfe5d7e38 100644
--- a/docs/pipelines/trainable/index.md
+++ b/docs/pipelines/trainable/index.md
@@ -27,8 +27,7 @@ In addition to the spaCy `train` CLI, EDS-NLP offers a `train` function that can
Let us define and train a full pipeline :
-
-```python
+```{ .python .no-check }
from pathlib import Path
import spacy
diff --git a/docs/pipelines/trainable/ner.md b/docs/pipelines/trainable/ner.md
index a1df70851..9929378ae 100644
--- a/docs/pipelines/trainable/ner.md
+++ b/docs/pipelines/trainable/ner.md
@@ -37,8 +37,7 @@ CRF (Conditional Random Fields) layers, one per label during both training and p
Let us define the pipeline and train it:
-
-```python
+```{ .python .no-check }
from pathlib import Path
import spacy
diff --git a/docs/pipelines/trainable/span-qualifier.md b/docs/pipelines/trainable/span-qualifier.md
index f0a030561..2af1beeb8 100644
--- a/docs/pipelines/trainable/span-qualifier.md
+++ b/docs/pipelines/trainable/span-qualifier.md
@@ -54,9 +54,7 @@ Let us define the pipeline and train it. We provide utils to train the model usi
=== "API-based (Light)"
-
-
- ```python
+ ```{ .python .no-check }
from pathlib import Path
import spacy
@@ -272,9 +270,7 @@ Let us define the pipeline and train it. We provide utils to train the model usi
To use it, load the model and process a text :
-
-
- ```python
+ ```{ .python .no-check }
import spacy
nlp = spacy.load("training/model-best")
@@ -431,9 +427,7 @@ Let us define the pipeline and train it. We provide utils to train the model usi
To use it, load the model and process a text :
-
-
- ```python
+ ```{ .python .no-check }
import spacy
nlp = spacy.load("training/model-best")
diff --git a/docs/scripts/autorefs/LICENSE b/docs/scripts/autorefs/LICENSE
new file mode 100644
index 000000000..15b59d08a
--- /dev/null
+++ b/docs/scripts/autorefs/LICENSE
@@ -0,0 +1,16 @@
+ISC License
+
+Copyright (c) 2019, Oleh Prypin
+Copyright (c) 2019, Timothée Mazzucotelli
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/docs/scripts/autorefs/plugin.py b/docs/scripts/autorefs/plugin.py
new file mode 100644
index 000000000..c4ac6a3a0
--- /dev/null
+++ b/docs/scripts/autorefs/plugin.py
@@ -0,0 +1,473 @@
+# ruff: noqa: E501
+"""
+# -----------
+VENDORED https://github.com/mkdocstrings/autorefs/blob/e19b9fa47dac136a529c2be0d7969106ca5d5106/src/mkdocs_autorefs/
+Waiting for the following PR to be merged: https://github.com/mkdocstrings/autorefs/pull/25
+# -----------
+
+This module contains the "mkdocs-autorefs" plugin.
+
+After each page is processed by the Markdown converter, this plugin stores the absolute URL of every HTML anchor
+it finds, to later be able to fix unresolved references.
+It stores them during the [`on_page_content` event hook](https://www.mkdocs.org/user-guide/plugins/#on_page_content).
+
+Just before writing the final HTML to the disc, during the
+[`on_post_page` event hook](https://www.mkdocs.org/user-guide/plugins/#on_post_page),
+this plugin searches for references of the form `[identifier][]` or `[title][identifier]` that were not resolved,
+and fixes them using the previously stored identifier-URL mapping.
+"""
+
+import contextlib
+import functools
+import logging
+import re
+from html import escape, unescape
+from typing import Any, Callable, Dict, List, Match, Optional, Sequence, Tuple, Union
+from urllib.parse import urlsplit
+from xml.etree.ElementTree import Element
+
+import pathspec
+from markdown import Markdown
+from markdown.extensions import Extension
+from markdown.inlinepatterns import REFERENCE_RE, ReferenceInlineProcessor
+from markdown.util import INLINE_PLACEHOLDER_RE
+from mkdocs.config import Config
+from mkdocs.config import config_options as c
+from mkdocs.plugins import BasePlugin
+from mkdocs.structure.pages import Page
+from mkdocs.structure.toc import AnchorLink
+from mkdocs.utils import warning_filter
+
+AUTO_REF_RE = re.compile(
+ r"autorefs-identifier|autorefs-optional|autorefs-optional-hover)="
+ r'("?)(?P[^"<>]*)\2>(?P.*?)'
+)
+"""A regular expression to match mkdocs-autorefs' special reference markers
+in the [`on_post_page` hook][mkdocs_autorefs.plugin.AutorefsPlugin.on_post_page].
+"""
+
+EvalIDType = Tuple[Any, Any, Any]
+
+
+class AutoRefInlineProcessor(ReferenceInlineProcessor):
+ """A Markdown extension."""
+
+ def __init__(self, *args, **kwargs): # noqa: D107
+ super().__init__(REFERENCE_RE, *args, **kwargs)
+
+ # Code based on
+ # https://github.com/Python-Markdown/markdown/blob/8e7528fa5c98bf4652deb13206d6e6241d61630b/markdown/inlinepatterns.py#L780
+
+ def handleMatch(self, m, data) -> Union[Element, EvalIDType]: # type: ignore[override] # noqa: N802,WPS111
+ """Handle an element that matched.
+
+ Arguments:
+ m: The match object.
+ data: The matched data.
+
+ Returns:
+ A new element or a tuple.
+ """
+ text, index, handled = self.getText(data, m.end(0))
+ if not handled:
+ return None, None, None
+
+ identifier, end, handled = self.evalId(data, index, text)
+ if not handled:
+ return None, None, None
+
+ if re.search(r"[/ \x00-\x1f]", identifier):
+ # Do nothing if the matched reference contains:
+ # - a space, slash or control character (considered unintended);
+ # - specifically \x01 is used by Python-Markdown HTML stash when there's inline formatting,
+ # but references with Markdown formatting are not possible anyway.
+ return None, m.start(0), end
+
+ return self.makeTag(identifier, text), m.start(0), end
+
+ def evalId(
+ self, data: str, index: int, text: str
+ ) -> EvalIDType: # noqa: N802 (parent's casing)
+ """Evaluate the id portion of `[ref][id]`.
+
+ If `[ref][]` use `[ref]`.
+
+ Arguments:
+ data: The data to evaluate.
+ index: The starting position.
+ text: The text to use when no identifier.
+
+ Returns:
+ A tuple containing the identifier, its end position, and whether it matched.
+ """
+ m = self.RE_LINK.match(data, pos=index) # noqa: WPS111
+ if not m:
+ return None, index, False
+
+ identifier = m.group(1)
+ if not identifier:
+ identifier = text
+ # Allow the entire content to be one placeholder, with the intent of catching things like [`Foo`][].
+ # It doesn't catch [*Foo*][] though, just due to the priority order.
+ # https://github.com/Python-Markdown/markdown/blob/1858c1b601ead62ed49646ae0d99298f41b1a271/markdown/inlinepatterns.py#L78
+ if INLINE_PLACEHOLDER_RE.fullmatch(identifier):
+ identifier = self.unescape(identifier)
+
+ end = m.end(0)
+ return identifier, end, True
+
+ def makeTag(self, identifier: str, text: str) -> Element: # type: ignore[override] # noqa: N802,W0221
+ """Create a tag that can be matched by `AUTO_REF_RE`.
+
+ Arguments:
+ identifier: The identifier to use in the HTML property.
+ text: The text to use in the HTML tag.
+
+ Returns:
+ A new element.
+ """
+ el = Element("span")
+ el.set("data-autorefs-identifier", identifier)
+ el.text = text
+ return el
+
+
+def relative_url(url_a: str, url_b: str) -> str:
+ """Compute the relative path from URL A to URL B.
+
+ Arguments:
+ url_a: URL A.
+ url_b: URL B.
+
+ Returns:
+ The relative URL to go from A to B.
+ """
+ parts_a = url_a.split("/")
+ url_b, anchor = url_b.split("#", 1)
+ parts_b = url_b.split("/")
+
+ # remove common left parts
+ while parts_a and parts_b and parts_a[0] == parts_b[0]:
+ parts_a.pop(0)
+ parts_b.pop(0)
+
+ # go up as many times as remaining a parts' depth
+ levels = len(parts_a) - 1
+ parts_relative = [".."] * levels + parts_b # noqa: WPS435
+ relative = "/".join(parts_relative)
+ return f"{relative}#{anchor}"
+
+
+def fix_ref(
+ url_mapper: Callable[[str], str], unmapped: List[str]
+) -> Callable: # noqa: WPS212,WPS231
+ """Return a `repl` function for [`re.sub`](https://docs.python.org/3/library/re.html#re.sub).
+
+ In our context, we match Markdown references and replace them with HTML links.
+
+ When the matched reference's identifier was not mapped to an URL, we append the identifier to the outer
+ `unmapped` list. It generally means the user is trying to cross-reference an object that was not collected
+ and rendered, making it impossible to link to it. We catch this exception in the caller to issue a warning.
+
+ Arguments:
+ url_mapper: A callable that gets an object's site URL by its identifier,
+ such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
+ unmapped: A list to store unmapped identifiers.
+
+ Returns:
+ The actual function accepting a [`Match` object](https://docs.python.org/3/library/re.html#match-objects)
+ and returning the replacement strings.
+ """
+
+ def inner(match: Match): # noqa: WPS212,WPS430
+ identifier = match["identifier"]
+ title = match["title"]
+ kind = match["kind"]
+
+ try:
+ url = url_mapper(unescape(identifier))
+ except KeyError:
+ if kind == "autorefs-optional":
+ return title
+ elif kind == "autorefs-optional-hover":
+                return f'<span title="{identifier}">{title}</span>'
+ unmapped.append(identifier)
+ if title == identifier:
+ return f"[{identifier}][]"
+ return f"[{title}][{identifier}]"
+
+ parsed = urlsplit(url)
+ external = parsed.scheme or parsed.netloc
+ classes = ["autorefs", "autorefs-external" if external else "autorefs-internal"]
+ class_attr = " ".join(classes)
+ if kind == "autorefs-optional-hover":
+            return f'<a class="{class_attr}" title="{identifier}" href="{escape(url)}">{title}</a>'
+        return f'<a class="{class_attr}" href="{escape(url)}">{title}</a>'
+
+ return inner
+
+
+def fix_refs(html: str, url_mapper: Callable[[str], str]) -> Tuple[str, List[str]]:
+ """Fix all references in the given HTML text.
+
+ Arguments:
+ html: The text to fix.
+ url_mapper: A callable that gets an object's site URL by its identifier,
+ such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
+
+ Returns:
+ The fixed HTML.
+ """
+ unmapped = [] # type: ignore
+ html = AUTO_REF_RE.sub(fix_ref(url_mapper, unmapped), html)
+ return html, unmapped
+
+
+class AutorefsExtension(Extension):
+ """Extension that inserts auto-references in Markdown."""
+
+ def extendMarkdown(
+ self, md: Markdown
+ ) -> None: # noqa: N802 (casing: parent method's name)
+ """Register the extension.
+
+ Add an instance of our [`AutoRefInlineProcessor`][mkdocs_autorefs.references.AutoRefInlineProcessor] to the Markdown parser.
+
+ Arguments:
+ md: A `markdown.Markdown` instance.
+ """
+ md.inlinePatterns.register(
+ AutoRefInlineProcessor(md),
+ "mkdocs-autorefs",
+ priority=168, # noqa: WPS432 # Right after markdown.inlinepatterns.ReferenceInlineProcessor
+ )
+
+
+log = logging.getLogger(f"mkdocs.plugins.{__name__}")
+log.addFilter(warning_filter)
+
+
+class AutorefsPlugin(BasePlugin):
+ """An `mkdocs` plugin.
+
+ This plugin defines the following event hooks:
+
+ - `on_config`
+ - `on_page_content`
+ - `on_post_page`
+
+ Check the [Developing Plugins](https://www.mkdocs.org/user-guide/plugins/#developing-plugins) page of `mkdocs`
+ for more information about its plugin system.
+ """
+
+ scan_toc: bool = True
+ current_page: Optional[str] = None
+ config_scheme = (("priority", c.ListOfItems(c.Type(str), default=[])),)
+
+ def __init__(self) -> None:
+ """Initialize the object."""
+ super().__init__()
+ self._url_map: Dict[str, str] = {}
+ self._abs_url_map: Dict[str, str] = {}
+ self.get_fallback_anchor: Optional[
+ Callable[[str], Optional[str]]
+ ] = None # noqa: WPS234
+ self._priority_patterns = None
+
+ @property
+ def priority_patterns(self):
+ if self._priority_patterns is None:
+ self._priority_patterns = [
+ pathspec.patterns.GitWildMatchPattern(pat)
+ for pat in self.config.get("priority")
+ ]
+ return self._priority_patterns
+
+ def register_anchor(self, page: str, identifier: str):
+ """Register that an anchor corresponding to an identifier was encountered when rendering the page.
+
+ Arguments:
+ page: The relative URL of the current page. Examples: `'foo/bar/'`, `'foo/index.html'`
+ identifier: The HTML anchor (without '#') as a string.
+ """
+ if identifier in self._url_map:
+ rev_patterns = list(enumerate(self.priority_patterns))[::-1]
+ old_priority_idx = next(
+ (
+ i
+ for i, pat in rev_patterns
+ if pat.match_file(self._url_map[identifier])
+ ),
+ len(rev_patterns),
+ )
+ new_priority_idx = next(
+ (i for i, pat in rev_patterns if pat.match_file(page)),
+ len(rev_patterns),
+ )
+ if new_priority_idx >= old_priority_idx:
+ return
+ self._url_map[identifier] = f"{page}#{identifier}"
+
+ def register_url(self, identifier: str, url: str):
+ """Register that the identifier should be turned into a link to this URL.
+
+ Arguments:
+ identifier: The new identifier.
+ url: The absolute URL (including anchor, if needed) where this item can be found.
+ """
+ self._abs_url_map[identifier] = url
+
+ def _get_item_url( # noqa: WPS234
+ self,
+ identifier: str,
+ fallback: Optional[Callable[[str], Sequence[str]]] = None,
+ ) -> str:
+ try:
+ return self._url_map[identifier]
+ except KeyError:
+ if identifier in self._abs_url_map:
+ return self._abs_url_map[identifier]
+ if fallback:
+ new_identifiers = fallback(identifier)
+ for new_identifier in new_identifiers:
+ with contextlib.suppress(KeyError):
+ url = self._get_item_url(new_identifier)
+ self._url_map[identifier] = url
+ return url
+ raise
+
+ def get_item_url( # noqa: WPS234
+ self,
+ identifier: str,
+ from_url: Optional[str] = None,
+ fallback: Optional[Callable[[str], Sequence[str]]] = None,
+ ) -> str:
+ """Return a site-relative URL with anchor to the identifier, if it's present anywhere.
+
+ Arguments:
+ identifier: The anchor (without '#').
+ from_url: The URL of the base page, from which we link towards the targeted pages.
+ fallback: An optional function to suggest alternative anchors to try on failure.
+
+ Returns:
+ A site-relative URL.
+ """
+ url = self._get_item_url(identifier, fallback)
+ if from_url is not None:
+ parsed = urlsplit(url)
+ if not parsed.scheme and not parsed.netloc:
+ return relative_url(from_url, url)
+ return url
+
+ def on_config(
+ self, config: Config, **kwargs
+ ) -> Config: # noqa: W0613,R0201 (unused arguments, cannot be static)
+ """Instantiate our Markdown extension.
+
+ Hook for the [`on_config` event](https://www.mkdocs.org/user-guide/plugins/#on_config).
+ In this hook, we instantiate our [`AutorefsExtension`][mkdocs_autorefs.references.AutorefsExtension]
+ and add it to the list of Markdown extensions used by `mkdocs`.
+
+ Arguments:
+ config: The MkDocs config object.
+ kwargs: Additional arguments passed by MkDocs.
+
+ Returns:
+ The modified config.
+ """
+ log.debug(f"{__name__}: Adding AutorefsExtension to the list")
+ config["markdown_extensions"].append(AutorefsExtension())
+ return config
+
+ def on_page_markdown(
+ self, markdown: str, page: Page, **kwargs
+ ) -> str: # noqa: W0613 (unused arguments)
+ """Remember which page is the current one.
+
+ Arguments:
+ markdown: Input Markdown.
+ page: The related MkDocs page instance.
+ kwargs: Additional arguments passed by MkDocs.
+
+ Returns:
+ The same Markdown. We only use this hook to map anchors to URLs.
+ """
+ self.current_page = page.url # noqa: WPS601
+ return markdown
+
+ def on_page_content(
+ self, html: str, page: Page, **kwargs
+ ) -> str: # noqa: W0613 (unused arguments)
+ """Map anchors to URLs.
+
+ Hook for the [`on_page_content` event](https://www.mkdocs.org/user-guide/plugins/#on_page_content).
+ In this hook, we map the IDs of every anchor found in the table of contents to the anchors absolute URLs.
+ This mapping will be used later to fix unresolved reference of the form `[title][identifier]` or
+ `[identifier][]`.
+
+ Arguments:
+ html: HTML converted from Markdown.
+ page: The related MkDocs page instance.
+ kwargs: Additional arguments passed by MkDocs.
+
+ Returns:
+ The same HTML. We only use this hook to map anchors to URLs.
+ """
+ if self.scan_toc:
+ log.debug(
+ f"{__name__}: Mapping identifiers to URLs for page {page.file.src_path}"
+ )
+ for item in page.toc.items:
+ self.map_urls(page.url, item)
+ return html
+
+ def map_urls(self, base_url: str, anchor: AnchorLink) -> None:
+ """Recurse on every anchor to map its ID to its absolute URL.
+
+ This method populates `self.url_map` by side-effect.
+
+ Arguments:
+ base_url: The base URL to use as a prefix for each anchor's relative URL.
+ anchor: The anchor to process and to recurse on.
+ """
+ self.register_anchor(base_url, anchor.id)
+ for child in anchor.children:
+ self.map_urls(base_url, child)
+
+ def on_post_page(
+ self, output: str, page: Page, **kwargs
+ ) -> str: # noqa: W0613 (unused arguments)
+ """Fix cross-references.
+
+ Hook for the [`on_post_page` event](https://www.mkdocs.org/user-guide/plugins/#on_post_page).
+ In this hook, we try to fix unresolved references of the form `[title][identifier]` or `[identifier][]`.
+ Doing that allows the user of `autorefs` to cross-reference objects in their documentation strings.
+ It uses the native Markdown syntax so it's easy to remember and use.
+
+ We log a warning for each reference that we couldn't map to an URL, but try to be smart and ignore identifiers
+ that do not look legitimate (sometimes documentation can contain strings matching
+ our [`AUTO_REF_RE`][mkdocs_autorefs.references.AUTO_REF_RE] regular expression that did not intend to reference anything).
+ We currently ignore references when their identifier contains a space or a slash.
+
+ Arguments:
+ output: HTML converted from Markdown.
+ page: The related MkDocs page instance.
+ kwargs: Additional arguments passed by MkDocs.
+
+ Returns:
+ Modified HTML.
+ """
+ log.debug(f"{__name__}: Fixing references in page {page.file.src_path}")
+
+ url_mapper = functools.partial(
+ self.get_item_url, from_url=page.url, fallback=self.get_fallback_anchor
+ )
+ fixed_output, unmapped = fix_refs(output, url_mapper)
+
+ if unmapped and log.isEnabledFor(logging.WARNING):
+ for ref in unmapped:
+ log.warning(
+ f"{__name__}: {page.file.src_path}: Could not find cross-reference target '[{ref}]'",
+ )
+
+ return fixed_output
diff --git a/docs/scripts/bibtex.py b/docs/scripts/bibtex.py
new file mode 100644
index 000000000..6064b8741
--- /dev/null
+++ b/docs/scripts/bibtex.py
@@ -0,0 +1,291 @@
+# Based on https://github.com/darwindarak/mdx_bib
+import re
+import string
+from collections import Counter, OrderedDict
+from typing import Tuple
+from xml.etree import ElementTree as etree
+from xml.etree.ElementTree import tostring as etree_to_string
+
+from markdown.extensions import Extension
+from markdown.inlinepatterns import Pattern
+from markdown.preprocessors import Preprocessor
+from mkdocs.config.config_options import Type as MkType
+from mkdocs.plugins import BasePlugin
+from pybtex.database.input import bibtex
+from pybtex.exceptions import PybtexError
+
+BRACKET_RE = re.compile(r"\[([^\[]+)\]")
+CITE_RE = re.compile(r"@([\w_:-]+)")
+DEF_RE = re.compile(r"\A {0,3}\[@([\w_:-]+)\]:\s*(.*)")
+INDENT_RE = re.compile(r"\A\t| {4}(.*)")
+
+CITATION_RE = r"(\[@(?:[\w_:-]+)(?: *, *@(?:[\w_:-]+))*\])"
+
+
+class Bibliography(object):
+ """Keep track of document references and citations for exporting"""
+
+ def __init__(self, extension, plugin, bibtex_file, order):
+ self.extension = extension
+ self.order = order
+ self.plugin = plugin
+
+ self.citations = OrderedDict()
+ self.references = dict()
+
+ if bibtex_file:
+ try:
+ parser = bibtex.Parser()
+ self.bibsource = parser.parse_file(bibtex_file).entries
+ self.labels = {
+ id: self.formatCitation(self.bibsource[id])
+ for id in self.bibsource.keys()
+ }
+ for value, occurrences in Counter(self.labels.values()).items():
+ if occurrences > 1:
+                        # suffix duplicated labels with a, b, c, ...
+                        i = 0
+                        for xkey, xvalue in self.labels.items():
+ if xvalue == value:
+ self.labels[
+ xkey
+ ] = f"{xvalue}{string.ascii_lowercase[i]}"
+ i += 1
+
+ except PybtexError:
+ print("Error loading bibtex file")
+ self.bibsource = dict()
+ self.labels = {}
+ else:
+ self.bibsource = dict()
+
+ def addCitation(self, citekey):
+ self.citations[citekey] = self.citations.get(citekey, 0) + 1
+
+ def setReference(self, citekey, reference):
+ self.references[citekey] = reference
+
+ def citationID(self, citekey):
+ return "cite-" + citekey
+
+ def referenceID(self, citekey):
+ return "ref-" + citekey
+
+ def formatAuthor(self, author):
+ out = (
+ author.last_names[0]
+ + ((" " + author.first_names[0][0]) if author.first_names else "")
+ + "."
+ )
+ if author.middle_names:
+ out += f"{author.middle_names[0][0]}."
+ return out.replace("{", "").replace("}", "")
+
+ def formatAuthorSurname(self, author):
+ out = author.last_names[0]
+ return out.replace("{", "").replace("}", "")
+
+ def formatReference(self, ref):
+ author_list = list(map(self.formatAuthor, ref.persons["author"]))
+
+ if len(author_list) == 1:
+ authors = author_list[0]
+ else:
+ authors = ", ".join(author_list[:-1])
+ authors += f" and {author_list[-1]}"
+
+ # Harvard style
+ # Surname, Initial, ... and Last_Surname,
+ # Initial, Year. Title. Journal, Volume(Issue), pages. doi.
+
+ title = ref.fields["title"].replace("{", "").replace("}", "")
+ journal = ref.fields.get("journal", "")
+ volume = ref.fields.get("volume", "")
+ issue = ref.fields.get("issue", "")
+ year = ref.fields.get("year")
+ pages = ref.fields.get("pages")
+ doi = ref.fields.get("doi")
+
+ ref_id = self.referenceID(ref.key)
+ reference = f"{authors}, {year}. {title}."
+ if journal:
+ reference += f" {journal}."
+ if volume:
+ reference += f" {volume}"
+ if issue:
+ reference += f"({issue})"
+ if pages:
+ reference += f", pp.{pages}"
+ reference += "."
+ if doi:
+ reference += (
+                f' <a href="https://doi.org/{doi}" target="_blank">{doi}</a>'
+            )
+        reference += "</p>"
+
+ return etree.fromstring(reference)
+
+ def formatCitation(self, ref):
+ author_list = list(map(self.formatAuthorSurname, ref.persons["author"]))
+ year = ref.fields.get("year")
+
+ if len(author_list) == 1:
+ citation = f"{author_list[0]}"
+ elif len(author_list) == 2:
+ citation = f"{author_list[0]} and {author_list[1]}"
+ else:
+ citation = f"{author_list[0]} et al."
+
+ citation += f", {year}"
+
+ return citation
+
+ def make_bibliography(self):
+ if self.order == "alphabetical":
+            raise NotImplementedError
+
+ div = etree.Element("div")
+ div.set("class", "footnote")
+ div.append(etree.Element("hr"))
+ ol = etree.SubElement(div, "ol")
+
+ if not self.citations:
+ return div
+
+ # table = etree.SubElement(div, "table")
+ # table.set("class", "references")
+ # tbody = etree.SubElement(table, "tbody")
+ etree.SubElement(div, "div")
+ for id in self.citations:
+ li = etree.SubElement(ol, "li")
+ li.set("id", self.referenceID(id))
+ # ref_id = etree.SubElement(li, "td")
+ ref_txt = etree.SubElement(li, "p")
+ if id in self.references:
+ self.extension.parser.parseChunk(ref_txt, self.references[id])
+ elif id in self.bibsource:
+ ref_txt.append(self.formatReference(self.bibsource[id]))
+ else:
+ ref_txt.text = "Missing citation for {}".format(id)
+
+ return div
+
+ def clear_citations(self):
+ self.citations = OrderedDict()
+
+
+class CitationsPreprocessor(Preprocessor):
+ """Gather reference definitions and citation keys"""
+
+ def __init__(self, bibliography):
+ self.bib = bibliography
+
+ def subsequentIndents(self, lines, i):
+ """Concatenate consecutive indented lines"""
+ linesOut = []
+ while i < len(lines):
+ m = INDENT_RE.match(lines[i])
+ if m:
+ linesOut.append(m.group(1))
+ i += 1
+ else:
+ break
+ return " ".join(linesOut), i
+
+ def run(self, lines):
+ linesOut = []
+ i = 0
+
+ while i < len(lines):
+ # Check to see if the line starts a reference definition
+ m = DEF_RE.match(lines[i])
+ if m:
+ key = m.group(1)
+ reference = m.group(2)
+ indents, i = self.subsequentIndents(lines, i + 1)
+ reference += " " + indents
+
+ self.bib.setReference(key, reference)
+ continue
+
+ # Look for all @citekey patterns inside hard brackets
+ for bracket in BRACKET_RE.findall(lines[i]):
+ for c in CITE_RE.findall(bracket):
+ self.bib.addCitation(c)
+ linesOut.append(lines[i])
+ i += 1
+
+ return linesOut
+
+
+class CitationsPattern(Pattern):
+ """Handles converting citations keys into links"""
+
+ def __init__(self, pattern, bibliography):
+ super(CitationsPattern, self).__init__(pattern)
+ self.bib = bibliography
+
+ def handleMatch(self, m):
+ span = etree.Element("span")
+ for cite_match in CITE_RE.finditer(m.group(2)):
+ id = cite_match.group(1)
+ if id in self.bib.bibsource:
+ a = etree.Element("a")
+ a.set("id", self.bib.citationID(id))
+ a.set("href", "./#" + self.bib.referenceID(id))
+ a.set("class", "citation")
+ a.text = self.bib.labels[id]
+ span.append(a)
+ else:
+ continue
+ if len(span) == 0:
+ return None
+ return span
+
+
+context_citations = None
+
+
+class CitationsExtension(Extension):
+ def __init__(self):
+ super(CitationsExtension, self).__init__()
+ self.bib = None
+
+ def extendMarkdown(self, md):
+ md.registerExtension(self)
+ self.parser = md.parser
+ self.md = md
+
+ md.preprocessors.register(CitationsPreprocessor(self.bib), "mdx_bib", 15)
+ md.inlinePatterns.register(
+ CitationsPattern(CITATION_RE, self.bib), "mdx_bib", 175
+ )
+
+
+def makeExtension(*args, **kwargs):
+ return CitationsExtension(*args, **kwargs)
+
+
+class BibTexPlugin(BasePlugin):
+ config_scheme: Tuple[Tuple[str, MkType]] = (
+ ("bibtex_file", MkType(str)), # type: ignore[assignment]
+ ("order", MkType(str, default="unsorted")), # type: ignore[assignment]
+ )
+
+ def __init__(self):
+ self.citations = None
+
+ def on_config(self, config, **kwargs):
+ extension = CitationsExtension()
+ self.bib = Bibliography(
+ extension,
+ self,
+ self.config["bibtex_file"],
+ self.config["order"],
+ )
+ extension.bib = self.bib
+ config["markdown_extensions"].append(extension)
+
+ def on_page_content(self, html, page, config, files):
+ html += "\n" + etree_to_string(self.bib.make_bibliography()).decode()
+ self.bib.clear_citations()
+ return html
diff --git a/docs/scripts/griffe_ext.py b/docs/scripts/griffe_ext.py
new file mode 100644
index 000000000..9f8fd5af4
--- /dev/null
+++ b/docs/scripts/griffe_ext.py
@@ -0,0 +1,102 @@
+import ast
+import importlib
+import inspect
+import sys
+from typing import Union
+
+import astunparse
+from griffe import Extension, Object, ObjectNode, get_logger
+from griffe.docstrings.dataclasses import DocstringSectionParameters
+
+logger = get_logger(__name__)
+
+
+class EDSNLPDocstrings(Extension):
+ def __init__(self):
+ super().__init__()
+
+ self.PIPE_OBJ = {}
+ self.FACT_MEM = {}
+ self.PIPE_TO_FACT = {}
+
+ def on_instance(self, node: Union[ast.AST, ObjectNode], obj: Object) -> None:
+ if (
+ isinstance(node, ast.Assign)
+ and obj.name == "create_component"
+ and isinstance(node.value, ast.Call)
+ and isinstance(node.value.func, ast.Call)
+ ):
+
+ module_name = obj.path.rsplit(".", 1)[0]
+ for name, mod in list(sys.modules.items()):
+ if name.startswith("edspdf"):
+ importlib.reload(mod)
+ module = importlib.reload(importlib.import_module(module_name))
+
+ config_node = node.value.func
+ config_node = next(
+ (kw.value for kw in config_node.keywords if kw.arg == "default_config"),
+ None,
+ )
+ try:
+ default_config = eval(astunparse.unparse(config_node), module.__dict__)
+ except Exception:
+ default_config = {}
+
+ # import object to get its evaluated docstring
+ try:
+ runtime_obj = getattr(module, obj.name)
+ source = inspect.getsource(runtime_obj)
+ self.visit(ast.parse(source))
+ except ImportError:
+ logger.debug(f"Could not get dynamic docstring for {obj.path}")
+ return
+ except AttributeError:
+ logger.debug(f"Object {obj.path} does not have a __doc__ attribute")
+ return
+
+ spec = inspect.getfullargspec(runtime_obj)
+ func_defaults = dict(
+ zip(spec.args[-len(spec.defaults) :], spec.defaults)
+ if spec.defaults
+ else (),
+ **(spec.kwonlydefaults or {}),
+ )
+ defaults = {**func_defaults, **default_config}
+ self.FACT_MEM[obj.path] = (node, obj, defaults)
+ pipe_path = runtime_obj.__module__ + "." + runtime_obj.__name__
+ self.PIPE_TO_FACT[pipe_path] = obj.path
+
+ if pipe_path in self.PIPE_OBJ:
+ pipe = self.PIPE_OBJ[pipe_path]
+ obj.docstring = pipe.docstring
+ else:
+ return
+ elif obj.is_class or obj.is_function:
+ self.PIPE_OBJ[obj.path] = obj
+ if obj.path in self.PIPE_TO_FACT:
+ node, fact_obj, defaults = self.FACT_MEM[self.PIPE_TO_FACT[obj.path]]
+ fact_obj.docstring = obj.docstring
+ obj = fact_obj
+ else:
+ return
+ else:
+ return
+
+ if obj.docstring is None:
+ return
+
+ param_section: DocstringSectionParameters = None
+ obj.docstring.parser = "numpy"
+ for section in obj.docstring.parsed:
+ if isinstance(section, DocstringSectionParameters):
+ param_section = section # type: ignore
+
+ if param_section is None:
+ return
+
+ for param in param_section.value:
+ if param.name in defaults:
+ param.default = str(defaults[param.name])
+ if param.default is not None and len(param.default) > 50:
+ param.default = param.default[: 50 - 3] + "..."
diff --git a/docs/scripts/plugin.py b/docs/scripts/plugin.py
index 2bb73a3b6..4e77a6710 100644
--- a/docs/scripts/plugin.py
+++ b/docs/scripts/plugin.py
@@ -1,85 +1,120 @@
import os
-import shutil
from pathlib import Path
-import mkdocs
+import mkdocs.config
+import mkdocs.structure
+import mkdocs.structure.files
+import mkdocs.structure.nav
-# Add the files from the project root
-files = [
- "changelog.md",
- "contributing.md",
-]
+def exclude_file(name):
+ return name.startswith("assets/fragments/")
-docs_gen = Path("docs")
-os.makedirs(docs_gen, exist_ok=True)
-for f in files:
- with open(docs_gen / Path(f), "w") as fd:
- fd.write(Path(f).read_text())
+# Add the files from the project root
-# Generate the code reference pages and navigation.
-doc_reference = Path("docs/reference")
-shutil.rmtree(doc_reference, ignore_errors=True)
-os.makedirs(doc_reference, exist_ok=True)
+VIRTUAL_FILES = {}
+REFERENCE_TEMPLATE = """
+# `{ident}`
+::: {ident}
+ options:
+ show_source: false
+"""
-for path in sorted(Path("edsnlp").rglob("*.py")):
- module_path = path.relative_to(".").with_suffix("")
- doc_path = path.relative_to("edsnlp").with_suffix(".md")
- full_doc_path = doc_reference / doc_path
- parts = list(module_path.parts)
+def on_files(files: mkdocs.structure.files.Files, config: mkdocs.config.Config):
+ """
+    Generate the code reference pages as virtual files and update the
+    navigation of the mkdocs config to point to them.
- if parts[-1] == "__init__":
- parts = parts[:-1]
- doc_path = doc_path.with_name("index.md")
- full_doc_path = full_doc_path.with_name("index.md")
- elif parts[-1] == "__main__":
- continue
+ Parameters
+ ----------
+    files: mkdocs.structure.files.Files
+        The original files collection
+    config: mkdocs.config.Config
+        The configuration object
+    """
- ident = ".".join(parts)
+ root = Path("edsnlp")
+ reference_nav = []
+ for path in sorted(root.rglob("*.py")):
+ module_path = path.relative_to(root.parent).with_suffix("")
+ doc_path = Path("reference") / path.relative_to(root.parent).with_suffix(".md")
+ # full_doc_path = Path("docs/reference/") / doc_path
+ parts = list(module_path.parts)
+ current = reference_nav
+ for part in parts[:-1]:
+ sub = next((item[part] for item in current if part in item), None)
+ if sub is None:
+ current.append({part: []})
+ sub = current[-1][part]
+ current = sub
+ if parts[-1] == "__init__":
+ parts = parts[:-1]
+ doc_path = doc_path.with_name("index.md")
+ current.append({"index.md": str(doc_path)})
+ elif parts[-1] == "__main__":
+ continue
+ else:
+ current.append({parts[-1]: str(doc_path)})
+ ident = ".".join(parts)
+ os.makedirs(doc_path.parent, exist_ok=True)
+ VIRTUAL_FILES[str(doc_path)] = REFERENCE_TEMPLATE.format(ident=ident)
- os.makedirs(full_doc_path.parent, exist_ok=True)
- with open(full_doc_path, "w") as fd:
- print(f"# `{ident}`\n", file=fd)
- print("::: " + ident, file=fd)
+ for item in config["nav"]:
+ if not isinstance(item, dict):
+ continue
+ key = next(iter(item.keys()))
+ if not isinstance(item[key], str):
+ continue
+ if item[key].strip("/") == "reference":
+ item[key] = reference_nav
+ VIRTUAL_FILES["contributing.md"] = Path("contributing.md").read_text()
+ VIRTUAL_FILES["changelog.md"] = Path("changelog.md").read_text()
-def on_files(files: mkdocs.structure.files.Files, config: mkdocs.config.Config) -> None:
- """
- Updates the navigation to take code reference files into account
- """
- reference_files = []
- for file in files:
- if file.src_path.startswith("reference/"):
- current = reference_files
- parts = ["edsnlp"] + file.src_path.replace(".md", "").split("/")[1:]
- for part in parts[:-1]:
- entry = next(
- (
- next(iter(entry.values()))
- for entry in current
- if next(iter(entry.keys())) == part
- ),
- None,
+ return mkdocs.structure.files.Files(
+ [file for file in files if not exclude_file(file.src_path)]
+ + [
+ mkdocs.structure.files.File(
+ file,
+ config["docs_dir"],
+ config["site_dir"],
+ config["use_directory_urls"],
+ )
+ for file in VIRTUAL_FILES
+ ]
+ )
+
+
+def on_nav(nav, config, files):
+ def rec(node):
+ if isinstance(node, list):
+ return [rec(item) for item in node]
+        if isinstance(node, mkdocs.structure.nav.Navigation):
+            return rec(node.items)
+        if node.is_section and node.title == "Code Reference":
+            return
+ if isinstance(node, mkdocs.structure.nav.Section):
+ if (
+ len(node.children)
+ and node.children[0].is_page
+ and not node.children[0].is_index
+ ):
+ first = node.children[0]
+ link = mkdocs.structure.nav.Link(
+ title=first.title,
+ url=first.url,
)
- if entry is None:
- entry = []
- current.append({part: entry})
- current = entry
- else:
- current = entry
- current.append({parts[-1]: file.src_path})
-
- def rec(tree):
- if isinstance(tree, str) and tree.strip("/") == "reference":
- return reference_files
- elif isinstance(tree, list):
- return [rec(item) for item in tree]
- elif isinstance(tree, dict):
- return {k: rec(item) for k, item in tree.items()}
- else:
- return tree
+ link.is_index = True
+ node.children.append(link)
+        return rec(node.children)
+
+ rec(nav.items)
+
- new_nav = rec(config["nav"])
- config["nav"] = new_nav
+def on_page_read_source(page, config):
+ if page.file.src_path in VIRTUAL_FILES:
+ return VIRTUAL_FILES[page.file.src_path]
+ return None
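To see what the nav-building loop in `on_files` produces, here is a self-contained sketch run on two hypothetical module paths; the nested dicts-of-lists mirror the structure mkdocs expects in `nav`:

```python
from pathlib import Path

# Hypothetical stand-ins for root.rglob("*.py")
paths = [Path("edsnlp/__init__.py"), Path("edsnlp/utils/filter.py")]

reference_nav = []
for path in sorted(paths):
    parts = list(path.with_suffix("").parts)
    doc_path = Path("reference") / path.with_suffix(".md")
    current = reference_nav
    for part in parts[:-1]:
        # Find (or create) the sub-list for this package level
        sub = next((item[part] for item in current if part in item), None)
        if sub is None:
            current.append({part: []})
            sub = current[-1][part]
        current = sub
    if parts[-1] == "__init__":
        current.append({"index.md": str(doc_path.with_name("index.md"))})
    else:
        current.append({parts[-1]: str(doc_path)})

print(reference_nav)
# [{'edsnlp': [{'index.md': 'reference/edsnlp/index.md'},
#              {'utils': [{'filter': 'reference/edsnlp/utils/filter.md'}]}]}]
```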
diff --git a/docs/tokenizers.md b/docs/tokenizers.md
index cecb26df2..45b3ea0ea 100644
--- a/docs/tokenizers.md
+++ b/docs/tokenizers.md
@@ -9,7 +9,7 @@ A comparison of the two tokenization methods is demonstrated below:
| Example | FrenchLanguage | EDSLanguage |
|--------------------|---------------------------|-------------------------------------------|
-| `ACR 5` | \[`ACR5`\] | \[`ACR`, `5`\] |
+| `ACR5` | \[`ACR5`\] | \[`ACR`, `5`\] |
| `26.5/` | \[`26.5/`\] | \[`26.5`, `/`\] |
| `\n \n CONCLUSION` | \[`\n \n`, `CONCLUSION`\] | \[`\n`, `\n`, `CONCLUSION`\] |
| `l'artère` | \[`l'`, `artère`\] | \[`l'`, `artère`\] (same) |
diff --git a/docs/tutorials/aggregating-results.md b/docs/tutorials/aggregating-results.md
index 262dd1e50..bc1919687 100644
--- a/docs/tutorials/aggregating-results.md
+++ b/docs/tutorials/aggregating-results.md
@@ -13,9 +13,7 @@ In some cases, you are not interested in individual extractions, but rather in d
Below is a simple implementation of this aggregation rule (this can be adapted for other comorbidity components and other qualification methods):
-
-
-```python
+```{ .python .no-check }
MIN_NUMBER_ENTITIES = 2 # (1)!
if not Doc.has_extension("aggregated"):
@@ -23,7 +21,7 @@ if not Doc.has_extension("aggregated"):
spans = doc.spans["diabetes"] # (3)!
kept_spans = [
- (span, span._.status, span._.detailled_status)
+ (span, span._.status, span._.detailed_status)
for span in spans
if not any([span._.negation, span._.hypothesis, span._.family])
] # (4)!
diff --git a/docs/tutorials/detecting-dates.md b/docs/tutorials/detecting-dates.md
index 05410e65a..e29a1843b 100644
--- a/docs/tutorials/detecting-dates.md
+++ b/docs/tutorials/detecting-dates.md
@@ -46,7 +46,7 @@ The following snippet adds the `eds.dates` component to the pipeline:
```python
import spacy
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe("eds.dates") # (1)
text = (
@@ -117,7 +117,7 @@ whether a given entity can be linked to a date.
import spacy
from datetime import datetime
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.dates")
@@ -219,13 +219,12 @@ def get_event_date(ent: Span) -> Optional[Span]:
We can apply this simple function:
-
-
-```python
+```{ .python .no-check }
import spacy
from utils import get_event_date
+from datetime import datetime
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.dates")
@@ -242,17 +241,20 @@ text = (
)
doc = nlp(text)
+now = datetime.now()
for ent in doc.ents:
+ if ent.label_ != "admission":
+ continue
date = get_event_date(ent)
- print(f"{ent.text:<20}{date.text:<20}{date._.date.to_datetime()}")
-# Out: admis 12 avril 2020 2020-04-12T00:00:00+02:00
-# Out: pris en charge l'année dernière -1 year
+ print(f"{ent.text:<20}{date.text:<20}{date._.date.to_datetime(now).strftime('%d/%m/%Y'):<15}{date._.date.to_duration(now)}")
+# Out: admis 12 avril 12/04/2023 21 weeks 4 days 6 hours 3 minutes 26 seconds
+# Out: pris en charge l'année dernière 10/09/2022 -1 year
```
Which will output:
| `ent` | `get_event_date(ent)` | `get_event_date(ent)._.date.to_datetime()` |
-| -------------- | --------------------- | ----------------------------------------- |
-| admis | 12 avril | `2020-04-12T00:00:00+02:00` |
-| pris en charge | l'année dernière | `-1 year` |
+|----------------|-----------------------|--------------------------------------------|
+| admis | 12 avril | `2020-04-12T00:00:00+02:00` |
+| pris en charge | l'année dernière | `-1 year` |
diff --git a/docs/tutorials/endlines.md b/docs/tutorials/endlines.md
index 110ed732d..ba30e16f8 100644
--- a/docs/tutorials/endlines.md
+++ b/docs/tutorials/endlines.md
@@ -25,9 +25,9 @@ Let's train the model using an example corpus of three documents:
```python
import spacy
-from edsnlp.pipelines.core.endlines import EndLinesModel
+from edsnlp.pipelines.core.endlines.model import EndLinesModel
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
text1 = """Le patient est arrivé hier soir.
Il est accompagné par son fils
@@ -70,18 +70,16 @@ PATH = "/tmp/path_to_model"
endlines.save(PATH)
```
-1. Initialize the [`EndLinesModel`][edsnlp.pipelines.core.endlines.endlinesmodel.EndLinesModel]
+1. Initialize the [`EndLinesModel`][edsnlp.pipelines.core.endlines.model.EndLinesModel]
object and then fit (and predict) in the training corpus.
2. The corpus should be an iterable of spacy documents.
## Use a trained model for inference
-
-
-```python
+```{ .python .no-check }
import spacy
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
PATH = "/path_to_model"
nlp.add_pipe("eds.endlines", config=dict(model_path=PATH)) # (1)
@@ -104,6 +102,6 @@ list(doc.sents)[0]
## Declared extensions
-It lets downstream matchers skip excluded tokens (see [normalisation](../pipelines/core/normalisation.md)) for more detail.
+It lets downstream matchers skip excluded tokens (see [normalisation](../pipelines/core/normalizer.md) for more detail).
\bibliography
diff --git a/docs/tutorials/matching-a-terminology.md b/docs/tutorials/matching-a-terminology.md
index 9913ca209..9f61c68d5 100644
--- a/docs/tutorials/matching-a-terminology.md
+++ b/docs/tutorials/matching-a-terminology.md
@@ -35,7 +35,7 @@ terms = dict(
respiratoire=["asthmatique", "respiratoire"],
)
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe("eds.matcher", config=dict(terms=terms))
doc = nlp(text)
@@ -79,8 +79,6 @@ Let's focus on two:
Matching on the lowercased version is extremely easy:
-
-
```python
import spacy
@@ -95,7 +93,7 @@ terms = dict(
respiratoire=["asthmatique", "respiratoire", "respiratoires"],
)
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe(
"eds.matcher",
config=dict(
@@ -152,7 +150,7 @@ terms = dict(
respiratoire=["asthmatique", "respiratoire", "respiratoires"],
)
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
# Add the normalisation component
nlp.add_pipe("eds.normalizer") # (3)
@@ -176,7 +174,7 @@ doc.ents
2. We've added `pneumopathie à covid19` to the list of synonyms detected by the pipeline.
Note that in the synonym we provide, we kept the accentuated `à`, whereas the example
displays an unaccentuated `a`.
-3. The component can be configured. See the [specific documentation](../pipelines/core/normalisation.md) for detail.
+3. The component can be configured. See the [specific documentation](../pipelines/core/normalizer.md) for detail.
4. The normalisation lives in the `NORM` attribute
5. We can tell the matcher to ignore excluded tokens (tokens tagged as pollution by the normalisation component).
This is not an obligation.
@@ -218,7 +216,7 @@ regex = dict(
)
terms = dict(respiratoire="asthmatique")
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
nlp.add_pipe(
"eds.matcher",
config=dict(
diff --git a/docs/tutorials/multiple-texts.md b/docs/tutorials/multiple-texts.md
index 0d9238cab..bdda88146 100644
--- a/docs/tutorials/multiple-texts.md
+++ b/docs/tutorials/multiple-texts.md
@@ -9,7 +9,7 @@ These can drastically increase throughput.
Consider this simple pipeline:
-```python title="Pipeline definition: pipeline.py"
+```python
import spacy
nlp = spacy.blank("fr")
@@ -118,9 +118,7 @@ To make sure we can follow along, we propose three recipes for getting the DataF
=== "Loading data from a CSV"
-
-
- ```python
+ ```{ .python .no-check }
import pandas as pd
data = pd.read_csv("note.csv")
@@ -128,9 +126,7 @@ To make sure we can follow along, we propose three recipes for getting the DataF
=== "Loading data from a Spark DataFrame"
-
-
- ```python
+ ```{ .python .no-check }
from pyspark.sql.session import SparkSession
spark = SparkSession.builder.getOrCreate()
@@ -149,7 +145,7 @@ We'll see in what follows how we can efficiently deploy our pipeline on the `#!p
We can deploy the pipeline using `nlp.pipe` directly, but we'll need some work to format the results in a usable way. Let's see how this might go, before using EDS-NLP's helper function to avoid the boilerplate code.
-```python title="processing.py"
+```python
from spacy.tokens import Doc
from typing import Any, Dict, List
@@ -187,9 +183,8 @@ def get_entities(doc: Doc) -> List[Dict[str, Any]]:
-```python
+```{ .python .no-check }
# ↑ Omitted code above ↑
-from processing import get_entities
import pandas as pd
data["doc"] = list(nlp.pipe(data.note_text)) # (1)
@@ -241,9 +236,7 @@ They share the same arguments:
In this case, you can use the `context` parameter and provide a list of column names you want to add:
-
-
- ```python
+ ```{ .python .no-check }
note_nlp = single_pipe(
data,
nlp,
@@ -266,7 +259,7 @@ For instance, the `get_entities` function defined earlier could be distributed d
-```python
+```{ .python .no-check }
# ↑ Omitted code above ↑
from edsnlp.processing.simple import pipe as single_pipe
from processing import get_entities
@@ -368,9 +361,7 @@ Suppose you have a Spark DataFrame:
=== "Loading a pre-existing table"
-
-
- ```python
+ ```{ .python .no-check }
from pyspark.sql.session import SparkSession
spark = SparkSession.builder.getOrCreate()
@@ -381,9 +372,7 @@ Suppose you have a Spark DataFrame:
=== "Using a Koalas DataFrame"
-
-
- ```python
+ ```{ .python .no-check }
from pyspark.sql.session import SparkSession
import databricks.koalas
@@ -400,9 +389,7 @@ Accepted types are the ones present in [`pyspark.sql.types`](https://spark.apach
EDS-NLP provides a helper function, [`pyspark_type_finder`][edsnlp.processing.distributed.pyspark_type_finder], to get the correct type for most Python objects. You just need to provide an example of the type you wish to collect:
-
-
-```python
+```{ .python .no-check }
int_type = pyspark_type_finder(1)
# Out: IntegerType()
@@ -419,9 +406,7 @@ Once again, using the helper is trivial:
=== "Spark"
-
-
- ```python
+ ```{ .python .no-check }
# ↑ Omitted code above ↑
from edsnlp.processing.distributed import pipe as distributed_pipe
@@ -438,9 +423,7 @@ Once again, using the helper is trivial:
=== "Koalas"
-
-
- ```python
+ ```{ .python .no-check }
# ↑ Omitted code above ↑
from edsnlp.processing.distributed import pipe as distributed_pipe
@@ -461,9 +444,7 @@ Using Spark or Koalas, you can deploy EDS-NLP pipelines on tens of millions of d
EDS-NLP provides a wrapper to simplify deployment even further:
-
-
-```python
+```{ .python .no-check }
# ↑ Omitted code above ↑
from edsnlp.processing import pipe
diff --git a/docs/tutorials/quick-examples.md b/docs/tutorials/quick-examples.md
index a8fefd4b0..237470ada 100644
--- a/docs/tutorials/quick-examples.md
+++ b/docs/tutorials/quick-examples.md
@@ -8,9 +8,7 @@ If you are
you might want to quickly apply a pipeline and display the output `doc` in a comprehensible way.
-
-
-```python
+```{ .python .no-check }
from edsnlp.viz import QuickExample
E = QuickExample(nlp) # (1)
@@ -20,9 +18,7 @@ E = QuickExample(nlp) # (1)
Next, simply call `E` with any string:
-
-
-```python
+```{ .python .no-check }
txt = "Le patient présente une anomalie."
E(txt)
```
@@ -40,17 +36,13 @@ E(txt)
By default, each `Qualifiers` in `nlp` adds a corresponding column to the output. Additional information can be displayed by using the `extensions` parameter. For instance, if entities have a custom `ent._.custom_ext` extension, it can be displayed by providing the extension when instantiating `QuickExample`:
-
-
-```python
+```{ .python .no-check }
E = QuickExample(nlp, extensions=["_.custom_ext"])
```
Finally, if you prefer to output a DataFrame instead of displaying a table, set the `as_dataframe` parameter to True:
-
-
-```python
+```{ .python .no-check }
E = QuickExample(nlp)
E(txt, as_dataframe=True)
```
diff --git a/docs/tutorials/reason.md b/docs/tutorials/reason.md
index 818230880..e9431e50b 100644
--- a/docs/tutorials/reason.md
+++ b/docs/tutorials/reason.md
@@ -83,9 +83,7 @@ for e in entities:
We can verify that named entities that do not overlap with the reason spans have their attribute `#!python reason._.is_reason == False`:
-
-
-```python
+```{ .python .no-check }
for e in doc.ents:
print(e.start, e, e._.is_reason)
# Out: 42 asthme True
diff --git a/docs/utilities/connectors/brat.md b/docs/utilities/connectors/brat.md
index ed3fffd68..e0aee6f77 100644
--- a/docs/utilities/connectors/brat.md
+++ b/docs/utilities/connectors/brat.md
@@ -17,9 +17,7 @@ T3 Drug 79 90 paracétamol
The point of the BRAT connector is to go from the standoff annotation format to an annotated spaCy document:
-
-
-```python
+```{ .python .no-check }
import spacy
from edsnlp.connectors.brat import BratConnector
@@ -27,7 +25,7 @@ from edsnlp.connectors.brat import BratConnector
brat = BratConnector("path/to/brat")
# Instantiate the spacy pipeline
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
# Convert all BRAT files to a list of documents
docs = brat.brat2docs(nlp)
diff --git a/docs/utilities/connectors/omop.md b/docs/utilities/connectors/omop.md
index 565a73e2c..7137b8147 100644
--- a/docs/utilities/connectors/omop.md
+++ b/docs/utilities/connectors/omop.md
@@ -30,14 +30,12 @@ And its OMOP-style representation, separated in two tables `note` and `note_nlp`
The following snippet expects the tables `note` and `note_nlp` to be already defined (eg through PySpark's `toPandas()` method).
-
-
-```python
+```{ .python .no-check }
import spacy
from edsnlp.connectors.omop import OmopConnector
# Instantiate a spacy pipeline
-nlp = spacy.blank("fr")
+nlp = spacy.blank("eds")
# Instantiate the connector
connector = OmopConnector(nlp)
diff --git a/docs/utilities/tests/blocs.md b/docs/utilities/tests/blocs.md
index 21bee1513..42a6edb0a 100644
--- a/docs/utilities/tests/blocs.md
+++ b/docs/utilities/tests/blocs.md
@@ -25,9 +25,7 @@ assert repr(v) == "1"
We can disable code checking for a specific code block by adding `` above it:
````md
-
-
-```python
+```{ .python .no-check }
test = undeclared_function(42)
```
````
diff --git a/edsnlp/components.py b/edsnlp/components.py
deleted file mode 100644
index 30198e08c..000000000
--- a/edsnlp/components.py
+++ /dev/null
@@ -1 +0,0 @@
-from edsnlp.pipelines.factories import * # noqa : used to import pipelines
diff --git a/edsnlp/language.py b/edsnlp/language.py
index ccb750db7..145cd4c2e 100644
--- a/edsnlp/language.py
+++ b/edsnlp/language.py
@@ -42,7 +42,7 @@ class EDSLanguage(French):
default_config = Defaults
-TOKENIZER_EXCEPTIONS = [r"Dr\.", r"Pr\.", r"M\.", r"Mme\.", r"Mlle\.", r"(?i)(?:ep\.)"]
+TOKENIZER_EXCEPTIONS = [r"Dr\.", r"Pr\.", r"M\.", r"Mme\.", r"Mlle\.", r"(?i:(?:ep\.))"]
class EDSTokenizer(DummyTokenizer):
diff --git a/edsnlp/patch_spacy_dot_components.py b/edsnlp/patch_spacy_dot_components.py
index 7f1b62dab..61383e096 100644
--- a/edsnlp/patch_spacy_dot_components.py
+++ b/edsnlp/patch_spacy_dot_components.py
@@ -5,7 +5,6 @@
from spacy.errors import Errors
from spacy.language import FactoryMeta
from spacy.pipe_analysis import validate_attrs
-from spacy.pipeline import Pipe
from spacy.util import SimpleFrozenDict, SimpleFrozenList, registry
@@ -51,10 +50,11 @@ def factory(
if not isinstance(name, str):
raise ValueError(Errors.E963.format(decorator="factory"))
if not isinstance(default_config, dict):
- err = Errors.E962.format(
- style="default config", name=name, cfg_type=type(default_config)
+ raise ValueError(
+ Errors.E962.format(
+ style="default config", name=name, cfg_type=type(default_config)
+ )
)
- raise ValueError(err)
def add_factory(factory_func: Callable) -> Callable:
internal_name = cls.get_factory_name(name)
@@ -102,77 +102,4 @@ def add_factory(factory_func: Callable) -> Callable:
return add_factory
-@classmethod
-def component(
- cls,
- name: str,
- *,
- assigns: Iterable[str] = SimpleFrozenList(),
- requires: Iterable[str] = SimpleFrozenList(),
- retokenizes: bool = False,
- func: Optional["Pipe"] = None,
-) -> Callable[..., Any]:
- """
- Patched from spaCy to allow back dots in factory
- names (https://github.com/aphp/edsnlp/pull/152)
-
- Register a new pipeline component. Can be used for stateless function
- components that don't require a separate factory. Can be used as a
- decorator on a function or classmethod, or called as a function with the
- factory provided as the func keyword argument. To create a component and
- add it to the pipeline, you can use nlp.add_pipe(name).
-
- name (str): The name of the component factory.
- assigns (Iterable[str]): Doc/Token attributes assigned by this component,
- e.g. "token.ent_id". Used for pipeline analysis.
- requires (Iterable[str]): Doc/Token attributes required by this component,
- e.g. "token.ent_id". Used for pipeline analysis.
- retokenizes (bool): Whether the component changes the tokenization.
- Used for pipeline analysis.
- func (Optional[Callable]): Factory function if not used as a decorator.
-
- DOCS: https://spacy.io/api/language#component
- """
- if name is not None:
- if not isinstance(name, str):
- raise ValueError(Errors.E963.format(decorator="component"))
- component_name = name if name is not None else util.get_object_name(func)
-
- def add_component(component_func: "Pipe") -> Callable:
- if isinstance(func, type): # function is a class
- raise ValueError(Errors.E965.format(name=component_name))
-
- def factory_func(nlp, name: str) -> "Pipe":
- return component_func
-
- internal_name = cls.get_factory_name(name)
- if internal_name in registry.factories:
- # We only check for the internal name here – it's okay if it's a
- # subclass and the base class has a factory of the same name. We
- # also only raise if the function is different to prevent raising
- # if module is reloaded. It's hacky, but we need to check the
- # existing functure for a closure and whether that's identical
- # to the component function (because factory_func created above
- # will always be different, even for the same function)
- existing_func = registry.factories.get(internal_name)
- closure = existing_func.__closure__
- wrapped = [c.cell_contents for c in closure][0] if closure else None
- if util.is_same_func(wrapped, component_func):
- factory_func = existing_func # noqa: F811
-
- cls.factory(
- component_name,
- assigns=assigns,
- requires=requires,
- retokenizes=retokenizes,
- func=factory_func,
- )
- return component_func
-
- if func is not None: # Support non-decorator use cases
- return add_component(func)
- return add_component
-
-
spacy.Language.factory = factory
-spacy.Language.component = component
diff --git a/edsnlp/pipelines/base.py b/edsnlp/pipelines/base.py
index 737c69061..685406307 100644
--- a/edsnlp/pipelines/base.py
+++ b/edsnlp/pipelines/base.py
@@ -1,10 +1,25 @@
+from collections import defaultdict
from operator import attrgetter
-from typing import List, Optional, Tuple
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Sequence,
+ Tuple,
+ Union,
+)
+from spacy import Language
from spacy.tokens import Doc, Span
+from edsnlp.utils.filter import filter_spans
-class BaseComponent(object):
+
+class BaseComponent:
"""
The `BaseComponent` adds a `set_extensions` method,
called at the creation of the object.
@@ -15,17 +30,23 @@ class BaseComponent(object):
imposes that the extensions be reset.
"""
- def __init__(self, *args, **kwargs):
+ def __init__(self, nlp: Language = None, name: str = None, *args, **kwargs):
super().__init__(*args, **kwargs)
-
+ self.nlp = nlp
+ self.name = name
self.set_extensions()
- @classmethod
- def set_extensions(cls) -> None:
+ def set_extensions(self):
"""
Set `Doc`, `Span` and `Token` extensions.
"""
- pass
+ Span.set_extension(
+ "value",
+ getter=lambda span: span._.get(span.label_)
+ if span._.has(span.label_)
+ else None,
+ force=True,
+ )
def get_spans(self, doc: Doc):
"""
@@ -81,3 +102,217 @@ def _boundaries(
boundaries = [(start, end) for start, end in zip(starts[:-1], starts[1:])]
return boundaries
+
+
+SeqStr = Union[str, Sequence[str]]
+SpanFilter = Union[bool, SeqStr]
+
+SpanSetterMapping = Dict[str, SpanFilter]
+SpanGetterMapping = Dict[str, SpanFilter]
+
+SpanGetter = Union[
+ SpanGetterMapping,
+ Callable[[Doc], Iterable[Span]],
+]
+SpanSetter = Union[
+ SpanSetterMapping,
+ Callable[[Doc, Iterable[Span]], Any],
+]
+
+
+def get_spans(doc, span_getter):
+ if callable(span_getter):
+ yield from span_getter(doc)
+ return
+ for key, span_filter in span_getter.items():
+ candidates = doc.spans.get(key, ()) if key != "ents" else doc.ents
+ if span_filter is True:
+ yield from candidates
+ else:
+ for span in candidates:
+ if span.label_ in span_filter:
+ yield span
+
+
+def validate_span_setter(value: Union[SeqStr, Dict[str, SpanFilter]]) -> SpanSetter:
+ if callable(value):
+ return value
+ if isinstance(value, str):
+ return {value: True}
+ if isinstance(value, list):
+ return {group: True for group in value}
+ elif isinstance(value, dict):
+ new_value = {}
+ for k, v in value.items():
+ if isinstance(v, bool):
+ new_value[k] = v
+ elif isinstance(v, str):
+ new_value[k] = [v]
+ elif isinstance(v, list) and all(isinstance(i, str) for i in v):
+ new_value[k] = v
+ else:
+ raise TypeError(
+ f"Invalid entry {value} ({type(value)}) for SpanSetterArg, "
+ f"expected bool/string(s), dict of bool/string(s) or callable"
+ )
+ return new_value
+ else:
+ raise TypeError(
+ f"Invalid entry {value} ({type(value)}) for SpanSetterArg, "
+ f"expected bool/string(s), dict of bool/string(s) or callable"
+ )
+
+
+def validate_span_getter(
+ value: Union[SeqStr, Dict[str, SpanFilter]], optional: bool = False
+) -> Optional[SpanGetter]:
+ if value is None:
+ if optional:
+ return None
+ raise ValueError(
+ "Mising entry for SpanGetterArg, "
+ "expected bool/string(s), dict of bool/string(s) or callable"
+ )
+ if callable(value):
+ return value
+ if isinstance(value, str):
+ return {value: True}
+ if isinstance(value, list):
+ return {group: True for group in value}
+ elif isinstance(value, dict):
+ new_value = {}
+ for k, v in value.items():
+ if isinstance(v, bool):
+ new_value[k] = v
+ elif isinstance(v, str):
+ new_value[k] = [v]
+ elif isinstance(v, list) and all(isinstance(i, str) for i in v):
+ new_value[k] = v
+ else:
+ raise TypeError(
+ f"Invalid entry {value} ({type(value)}) for SpanGetterArg, "
+ f"expected bool/string(s), dict of bool/string(s) or callable"
+ )
+ return new_value
+ else:
+ raise TypeError(
+ f"Invalid entry {value} ({type(value)}) for SpanGetterArg, "
+ f"expected bool/string(s), dict of bool/string(s) or callable"
+ )
+
+
+class SpanSetterArg:
+ """
+    Valid values for the `span_setter` argument of a component can be:
+
+ - a (doc, matches) -> None callable
+ - a span group name
+ - a list of span group names
+ - a dict of group name to True or list of labels
+
+ The group name `"ents"` is a special case, and will add the matches to `doc.ents`
+
+ Examples
+ --------
+ - `span_setter=["ents", "ckd"]` will add the matches to both `doc.ents` and
+ `doc.spans["ckd"]`. It is equivalent to `{"ents": True, "ckd": True}`.
+ - `span_setter={"ents": ["foo", "bar"]}` will add the matches with label
+ "foo" and "bar" to `doc.ents`.
+ - `span_setter="ents"` will add all matches only to `doc.ents`.
+ - `span_setter="ckd"` will add all matches only to `doc.spans["ckd"]`.
+ """
+
+ @classmethod
+ def __get_validators__(cls):
+ yield cls.validate
+
+ @classmethod
+ def validate(cls, value: Union[SeqStr, Dict[str, SpanFilter]]) -> SpanSetter:
+ return validate_span_setter(value)
+
+
+class SpanGetterArg:
+ """
+    Valid values for the `span_getter` argument of a component can be:
+
+ - a (doc) -> spans callable
+ - a span group name
+ - a list of span group names
+ - a dict of group name to True or list of labels
+
+ The group name `"ents"` is a special case, and will get the matches from `doc.ents`
+
+ Examples
+ --------
+ - `span_getter=["ents", "ckd"]` will get the matches from both `doc.ents` and
+ `doc.spans["ckd"]`. It is equivalent to `{"ents": True, "ckd": True}`.
+ - `span_getter={"ents": ["foo", "bar"]}` will get the matches with label
+ "foo" and "bar" from `doc.ents`.
+ - `span_getter="ents"` will get all matches from `doc.ents`.
+ - `span_getter="ckd"` will get all matches from `doc.spans["ckd"]`.
+ """
+
+ @classmethod
+ def __get_validators__(cls):
+ yield cls.validate
+
+ @classmethod
+    def validate(cls, value: Union[SeqStr, Dict[str, SpanFilter]]) -> SpanGetter:
+        return validate_span_getter(value)
+
+
+class BaseNERComponent(BaseComponent):
+ def __init__(
+ self,
+ nlp: Language = None,
+ name: str = None,
+ *args,
+ span_setter: SpanSetterArg,
+ **kwargs,
+ ):
+ super().__init__(nlp, name, *args, **kwargs)
+ self.span_setter: SpanSetter = validate_span_setter(span_setter) # type: ignore
+
+ def set_spans(self, doc, matches):
+ if callable(self.span_setter):
+ self.span_setter(doc, matches)
+ else:
+ match_all = []
+ label_to_group = defaultdict(list)
+ for name, spans_filter in self.span_setter.items():
+ if name != "ents":
+ doc.spans.setdefault(name, [])
+ if spans_filter:
+ if spans_filter is True:
+ match_all.append(name)
+ else:
+ for label in spans_filter:
+ label_to_group[label].append(name)
+
+ new_ents = [] if "ents" in self.span_setter else None
+
+ for span in matches:
+ for group in match_all + label_to_group[span.label_]:
+ if group == "ents":
+ new_ents.append(span)
+ else:
+ doc.spans[group].append(span)
+ if new_ents is not None:
+ doc.ents = filter_spans((*new_ents, *doc.ents))
+ return doc
+
+
+if TYPE_CHECKING:
+ SpanGetterArg = Union[ # noqa: F811
+ str,
+ Sequence[str],
+ SpanGetterMapping,
+ Callable[[Doc], Iterable[Span]],
+ ]
+ SpanSetterArg = Union[ # noqa: F811
+ str,
+ Sequence[str],
+ SpanSetterMapping,
+ Callable[[Doc, Iterable[Span]], Any],
+ ]
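To make the accepted shorthands concrete, here is a quick sketch of how the validator normalises them (assuming this branch of edsnlp is installed, so that `edsnlp.pipelines.base` exposes the functions added above):

```python
from edsnlp.pipelines.base import validate_span_setter

print(validate_span_setter("ents"))  # {'ents': True}
print(validate_span_setter(["ents", "ckd"]))  # {'ents': True, 'ckd': True}
print(validate_span_setter({"ents": "foo"}))  # {'ents': ['foo']}
```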
diff --git a/edsnlp/pipelines/core/context/context.py b/edsnlp/pipelines/core/context/context.py
index 698e4fb5b..4418a2237 100644
--- a/edsnlp/pipelines/core/context/context.py
+++ b/edsnlp/pipelines/core/context/context.py
@@ -13,7 +13,7 @@ class ContextAdder(BaseComponent):
Parameters
----------
nlp : Language
- The spaCy object.
+        The pipeline object.
context : List[str]
The list of extensions to add to the `Doc`
"""
diff --git a/edsnlp/pipelines/core/context/factory.py b/edsnlp/pipelines/core/context/factory.py
index 2f8d85da4..ee5e6eae2 100644
--- a/edsnlp/pipelines/core/context/factory.py
+++ b/edsnlp/pipelines/core/context/factory.py
@@ -9,14 +9,12 @@
)
-@Language.factory(
- "eds.context",
- default_config=DEFAULT_CONFIG,
-)
+@Language.factory("eds.context")
def create_component(
nlp: Language,
name: str,
- context: List[str],
+ *,
+ context: List[str] = ["note_id"],
):
return ContextAdder(
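With the signature now keyword-only, the component is configured the usual way through `config`; a minimal sketch (the extension names are illustrative):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.context", config=dict(context=["note_id", "note_datetime"]))

doc = nlp("Texte de test")
doc._.note_id  # None until filled, e.g. by a data loader
```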
diff --git a/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py b/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py
index 5831fd409..708c7d190 100644
--- a/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py
+++ b/edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py
@@ -1,8 +1,9 @@
import re
+import warnings
from collections import defaultdict
from functools import lru_cache
from operator import attrgetter
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
from loguru import logger
from spacy.language import Language
@@ -11,8 +12,7 @@
from edsnlp.matchers.phrase import EDSPhraseMatcher
from edsnlp.matchers.regex import RegexMatcher, create_span
from edsnlp.matchers.utils import get_text
-from edsnlp.pipelines.base import BaseComponent
-from edsnlp.utils.filter import filter_spans
+from edsnlp.pipelines.base import BaseNERComponent, SpanSetterArg
from edsnlp.utils.lists import flatten
from . import models
@@ -31,7 +31,7 @@ def get_window(
return doclike.doc[start:end]
-class ContextualMatcher(BaseComponent):
+class ContextualMatcher(BaseNERComponent):
"""
Allows additional matching in the surrounding context of the main match group,
for qualification/filtering.
@@ -40,9 +40,9 @@ class ContextualMatcher(BaseComponent):
----------
nlp : Language
spaCy `Language` object.
- name : str
+ name : Optional[str]
The name of the pipe
- patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ patterns : Union[Dict[str, Any], List[Dict[str, Any]]]
The configuration dictionary
assign_as_span : bool
Whether to store eventual extractions defined via the `assign` key as Spans
@@ -51,7 +51,7 @@ class ContextualMatcher(BaseComponent):
Attribute to match on, eg `TEXT`, `NORM`, etc.
ignore_excluded : bool
Whether to skip excluded tokens during matching.
- ignore_space_tokens: bool
+ ignore_space_tokens : bool
Whether to skip space tokens during matching.
alignment_mode : str
Overwrite alignment mode.
@@ -60,12 +60,19 @@ class ContextualMatcher(BaseComponent):
[here](https://docs.python.org/3/library/re.html#flags))
include_assigned : bool
Whether to include (eventual) assign matches to the final entity
+ label_name : Optional[str]
+ Deprecated, use `label` instead. The label to assign to the matched entities
+ label : str
+ The label to assign to the matched entities
+ span_setter : SpanSetterArg
+ How to set matches on the doc
"""
def __init__(
self,
- nlp: Language,
- name: str,
+ nlp: Optional[Language],
+ name: Optional[str] = None,
+ *,
patterns: Union[Dict[str, Any], List[Dict[str, Any]]],
assign_as_span: bool = False,
alignment_mode: str = "expand",
@@ -74,9 +81,22 @@ def __init__(
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
include_assigned: bool = False,
+ label_name: Optional[str] = None,
+ label: Optional[str] = None,
+ span_setter: SpanSetterArg = {"ents": True},
):
- self.name = name
- self.nlp = nlp
+ if label is None and label_name is not None:
+ warnings.warn(
+ "`label_name` is deprecated, use `label` instead.",
+ DeprecationWarning,
+ )
+ label = label_name
+ if label is None:
+ raise ValueError("`label` parameter is required.")
+ self.label = label
+
+ super().__init__(nlp=nlp, name=name, span_setter=span_setter)
+
self.attr = attr
self.assign_as_span = assign_as_span
self.ignore_excluded = ignore_excluded
@@ -201,8 +221,8 @@ def __init__(
self.set_extensions()
- @classmethod
- def set_extensions(cls) -> None:
+ def set_extensions(self) -> None:
+ super().set_extensions()
if not Span.has_extension("assigned"):
Span.set_extension("assigned", default=dict())
if not Span.has_extension("source"):
@@ -210,7 +230,7 @@ def set_extensions(cls) -> None:
def filter_one(self, span: Span) -> Span:
"""
- Filter extracted entity based on the "exclusion filter" mentionned
+ Filter extracted entity based on the "exclusion filter" mentioned
in the configuration
Parameters
@@ -336,7 +356,6 @@ def assign_one(self, span: Span) -> Span:
if replace_key is None and self.replace_key[source] is not None:
# There should have been a replacement, but none was found
# So we discard the entity
- yield from []
return
# Entity replacement
@@ -371,7 +390,7 @@ def assign_one(self, span: Span) -> Span:
for replaced in kept_ents:
# Propagating attributes from the anchor
replaced._.source = source
- replaced.label_ = self.name
+ replaced.label_ = self.label
else:
# Entity expansion
@@ -386,7 +405,7 @@ def assign_one(self, span: Span) -> Span:
)
span._.source = source
- span.label_ = self.name
+ span.label_ = self.label
kept_ents = [span]
key = "value_span" if self.assign_as_span else "value_text"
@@ -421,7 +440,7 @@ def process(self, doc: Doc) -> List[Span]:
"""
matches = self.phrase_matcher(doc, as_spans=True)
- regex_matches = self.regex_matcher(doc, as_spans=True)
+ regex_matches = list(self.regex_matcher(doc, as_spans=True))
spans = (*matches, *regex_matches)
for span in spans:
@@ -442,16 +461,6 @@ def __call__(self, doc: Doc) -> Doc:
spaCy Doc object, annotated for extracted terms.
"""
- ents = list(self.process(doc))
-
- doc.spans[self.name] = ents
-
- ents, discarded = filter_spans(list(doc.ents) + ents, return_discarded=True)
-
- doc.ents = ents
-
- if "discarded" not in doc.spans:
- doc.spans["discarded"] = []
- doc.spans["discarded"].extend(discarded)
-
+ spans = list(self.process(doc))
+ self.set_spans(doc, spans)
return doc
diff --git a/edsnlp/pipelines/core/contextual_matcher/factory.py b/edsnlp/pipelines/core/contextual_matcher/factory.py
index bbee566ee..b4bc58762 100644
--- a/edsnlp/pipelines/core/contextual_matcher/factory.py
+++ b/edsnlp/pipelines/core/contextual_matcher/factory.py
@@ -1,76 +1,25 @@
-import re
-from typing import Any, Dict, List, Union
-
from spacy.language import Language
from edsnlp.pipelines.core.contextual_matcher import ContextualMatcher
from edsnlp.utils.deprecation import deprecated_factory
DEFAULT_CONFIG = dict(
+ assign_as_span=False,
+ alignment_mode="expand",
attr="NORM",
- ignore_excluded=True,
- ignore_space_tokens=False,
regex_flags=0,
- alignment_mode="expand",
- assign_as_span=False,
+ ignore_excluded=False,
+ ignore_space_tokens=False,
include_assigned=False,
+ label_name=None,
+ label=None,
+ span_setter={"ents": True},
)
-
-@deprecated_factory(
- "contextual-matcher", "eds.contextual-matcher", default_config=DEFAULT_CONFIG
-)
-@Language.factory("eds.contextual-matcher", default_config=DEFAULT_CONFIG)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Union[Dict[str, Any], List[Dict[str, Any]]],
- assign_as_span: bool,
- alignment_mode: str,
- attr: str,
- ignore_excluded: bool,
- ignore_space_tokens: bool,
- regex_flags: Union[re.RegexFlag, int],
- include_assigned: bool,
-):
- """
- Allows additional matching in the surrounding context of the main match group,
- for qualification/filtering.
-
- Parameters
- ----------
- nlp : Language
- spaCy `Language` object.
- name : str
- The name of the pipe
- patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
- The configuration dictionary
- assign_as_span : bool
- Whether to store eventual extractions defined via the `assign` key as Spans
- or as string
- attr : str
- Attribute to match on, eg `TEXT`, `NORM`, etc.
- ignore_excluded : bool
- Whether to skip excluded tokens during matching.
- alignment_mode : str
- Overwrite alignment mode.
- regex_flags : Union[re.RegexFlag, int]
- RegExp flags to use when matching, filtering and assigning (See
- [here](https://docs.python.org/3/library/re.html#flags))
- include_assigned : bool
- Whether to include (eventual) assign matches to the final entity
-
- """
-
- return ContextualMatcher(
- nlp,
- name,
- patterns,
- assign_as_span,
- alignment_mode,
- attr=attr,
- ignore_excluded=ignore_excluded,
- ignore_space_tokens=ignore_space_tokens,
- regex_flags=regex_flags,
- include_assigned=include_assigned,
- )
+create_component = deprecated_factory(
+ "contextual-matcher",
+ "eds.contextual-matcher",
+)(ContextualMatcher)
+create_component = Language.factory(
+ "eds.contextual-matcher",
+)(create_component)
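The new registration style wraps the class itself rather than a dedicated `create_component` function, so the `__init__` signature (and its keyword-only defaults) becomes the factory's configuration schema. A minimal sketch of the pattern, with a hypothetical component:

```python
from spacy.language import Language
from spacy.tokens import Doc


class MyComponent:
    def __init__(
        self,
        nlp: Language,
        name: str = "custom.component",
        *,
        label: str = "match",
    ):
        self.label = label

    def __call__(self, doc: Doc) -> Doc:
        return doc


# Registering the class directly: spaCy inspects __init__ to resolve
# the config, so no separate factory function is needed.
create_component = Language.factory("custom.component")(MyComponent)
```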
diff --git a/edsnlp/pipelines/core/endlines/__init__.py b/edsnlp/pipelines/core/endlines/__init__.py
index 866c3f84e..e69de29bb 100644
--- a/edsnlp/pipelines/core/endlines/__init__.py
+++ b/edsnlp/pipelines/core/endlines/__init__.py
@@ -1 +0,0 @@
-from .endlines import EndLines, EndLinesModel
diff --git a/edsnlp/pipelines/core/endlines/endlines.py b/edsnlp/pipelines/core/endlines/endlines.py
index 9cb1ff1bf..56bfd0cfc 100644
--- a/edsnlp/pipelines/core/endlines/endlines.py
+++ b/edsnlp/pipelines/core/endlines/endlines.py
@@ -6,43 +6,128 @@
from spacy.language import Language
from spacy.tokens import Doc, Span, Token
-from edsnlp.pipelines.core.matcher import GenericMatcher
+from edsnlp.pipelines.core.matcher.matcher import GenericMatcher
from edsnlp.utils.filter import get_spans
-from .endlinesmodel import EndLinesModel
from .functional import build_path
+from .model import EndLinesModel
-class EndLines(GenericMatcher):
- """
- spaCy Pipeline to detect whether a newline character should
- be considered a space (ie introduced by the PDF).
+class EndLinesMatcher(GenericMatcher):
+ '''
+ The `eds.endlines` component classifies newline characters as actual end of lines
+ or mere spaces. In the latter case, the token is removed from the normalised
+ document.
- The pipeline will add the extension `end_line` to spans
- and tokens. The `end_line` attribute is a boolean or `None`,
- set to `True` if the pipeline predicts that the new line
- is an end line character. Otherwise, it is set to `False`
- if the new line is classified as a space. If no classification
- has been done over that token, it will remain `None`.
+    Behind the scenes, it uses an `EndLinesModel` instance, which is an unsupervised
+ algorithm based on the work of [@zweigenbaum2016].
+
+ Training
+ --------
+ ```python
+ import spacy
+ from edsnlp.pipelines.core.endlines.model import EndLinesModel
+
+ nlp = spacy.blank("eds")
+
+ texts = [
+ """
+ Le patient est arrivé hier soir.
+ Il est accompagné par son fils
+
+ ANTECEDENTS
+ Il a fait une TS en 2010
+ Fumeur, il est arreté il a 5 mois
+ Chirurgie de coeur en 2011
+ CONCLUSION
+ Il doit prendre
+ le medicament indiqué 3 fois par jour. Revoir médecin
+ dans 1 mois.
+ DIAGNOSTIC :
+
+ Antecedents Familiaux:
+ - 1. Père avec diabete
+ """,
+ """
+ J'aime le
+ fromage...
+ """,
+ ]
+
+ docs = list(nlp.pipe(texts))
+
+ # Train and predict an EndLinesModel
+ endlines = EndLinesModel(nlp=nlp)
+
+ df = endlines.fit_and_predict(docs)
+ df.head()
+
+ PATH = "/tmp/path_to_save"
+ endlines.save(PATH)
+ ```
+
+ Examples
+ --------
+ ```python
+ import spacy
+ from spacy.tokens import Span
+ from spacy import displacy
+
+ nlp = spacy.blank("eds")
+
+ PATH = "/tmp/path_to_save"
+ nlp.add_pipe("eds.endlines", config=dict(model_path=PATH))
+
+ docs = list(nlp.pipe(texts))
+
+ doc_exemple = docs[1]
+
+ doc_exemple.ents = tuple(
+ Span(doc_exemple, token.i, token.i + 1, "excluded")
+ for token in doc_exemple
+ if token.tag_ == "EXCLUDED"
+ )
+
+ displacy.render(doc_exemple, style="ent", options={"colors": {"space": "red"}})
+ ```
+
+ Extensions
+ ----------
+ The `eds.endlines` pipeline declares one extension, on both `Span` and `Token`
+ objects. The `end_line` attribute is a boolean, set to `True` if the pipeline
+    predicts that the newline is an end-of-line character, and to `False` if it
+    is classified as a space.
+
+ The pipeline also sets the `excluded` custom attribute on newlines that are
+ classified as spaces. It lets downstream matchers skip excluded tokens
+    (see [normalisation](/pipelines/core/normalisation/) for more detail).
Parameters
----------
nlp : Language
- spaCy nlp pipeline to use for matching.
-
- end_lines_model : Optional[Union[str, EndLinesModel]], by default None
- path to trained model. If None, it will use a default model
- """
+ The pipeline object.
+    name : str
+ The name of the component.
+ model_path : Optional[Union[str, EndLinesModel]]
+ Path to trained model. If None, it will use a default model
+
+ Authors and citation
+ --------------------
+ The `eds.endlines` pipeline was developed by AP-HP's Data Science team based on
+ the work of [@zweigenbaum2016].
+ '''
def __init__(
self,
nlp: Language,
- end_lines_model: Optional[Union[str, EndLinesModel]],
- **kwargs,
+ name: Optional[str] = "eds.endlines",
+ *,
+ model_path: Optional[Union[str, EndLinesModel]] = None,
):
super().__init__(
- nlp,
+ nlp=nlp,
+ name=name,
terms=None,
attr="TEXT",
regex=dict(
@@ -50,10 +135,9 @@ def __init__(
),
ignore_excluded=False,
ignore_space_tokens=False,
- **kwargs,
)
- self._read_model(end_lines_model)
+ self._read_model(model_path)
def _read_model(self, end_lines_model: Optional[Union[str, EndLinesModel]]):
"""
@@ -70,10 +154,10 @@ def _read_model(self, end_lines_model: Optional[Union[str, EndLinesModel]]):
with open(path, "rb") as inp:
self.model = pickle.load(inp)
- elif type(end_lines_model) == str:
+ elif isinstance(end_lines_model, str):
with open(end_lines_model, "rb") as inp:
self.model = pickle.load(inp)
- elif type(end_lines_model) == EndLinesModel:
+ elif isinstance(end_lines_model, EndLinesModel):
self.model = end_lines_model
else:
raise TypeError(
diff --git a/edsnlp/pipelines/core/endlines/factory.py b/edsnlp/pipelines/core/endlines/factory.py
index f82b708a0..1c9b0c32e 100644
--- a/edsnlp/pipelines/core/endlines/factory.py
+++ b/edsnlp/pipelines/core/endlines/factory.py
@@ -1,17 +1,20 @@
-from typing import Optional
-
from spacy.language import Language
from edsnlp.utils.deprecation import deprecated_factory
-from .endlines import EndLines
+from .endlines import EndLinesMatcher
+DEFAULT_CONFIG = dict(
+ model_path=None,
+)
-@deprecated_factory("endlines", "eds.endlines")
-@Language.factory("eds.endlines")
-def create_component(
- nlp: Language,
- name: str,
- model_path: Optional[str],
-):
- return EndLines(nlp, end_lines_model=model_path)
+create_component = EndLinesMatcher
+create_component = deprecated_factory(
+ "endlines",
+ "eds.endlines",
+ assigns=["doc.ents", "doc.spans"],
+)(create_component)
+create_component = Language.factory(
+ "eds.endlines",
+ assigns=["doc.ents", "doc.spans"],
+)(create_component)
diff --git a/edsnlp/pipelines/core/endlines/endlinesmodel.py b/edsnlp/pipelines/core/endlines/model.py
similarity index 99%
rename from edsnlp/pipelines/core/endlines/endlinesmodel.py
rename to edsnlp/pipelines/core/endlines/model.py
index 950fa3363..6366f5ef5 100644
--- a/edsnlp/pipelines/core/endlines/endlinesmodel.py
+++ b/edsnlp/pipelines/core/endlines/model.py
@@ -520,7 +520,8 @@ def _predict_M2(self, B1: pd.Series, B2: pd.Series) -> Dict[str, Any]:
return outputs
def _fit_encoder_2S(self, S1: pd.Series, S2: pd.Series) -> OneHotEncoder:
- """Fit a one hot encoder with 2 Series. It concatenates the series and after it fits.
+ """Fit a one hot encoder with 2 Series. It concatenates the series and after it
+ fits.
Parameters
----------
@@ -685,7 +686,8 @@ def _shift_col(
@classmethod
def _get_attributes(cls, doc: Doc, i=0):
- """Function to get the attributes of tokens of a spacy doc in a pd.DataFrame format.
+ """Function to get the attributes of tokens of a spacy doc in a pd.DataFrame
+ format.
Parameters
----------
diff --git a/edsnlp/pipelines/core/matcher/factory.py b/edsnlp/pipelines/core/matcher/factory.py
index b34147219..da8c4c98b 100644
--- a/edsnlp/pipelines/core/matcher/factory.py
+++ b/edsnlp/pipelines/core/matcher/factory.py
@@ -1,86 +1,27 @@
-from typing import Any, Dict, List, Optional, Union
-
from spacy.language import Language
-from edsnlp.pipelines.core.matcher import GenericMatcher
-from edsnlp.pipelines.core.matcher.matcher import GenericTermMatcher
from edsnlp.utils.deprecation import deprecated_factory
+from .matcher import GenericMatcher
+
DEFAULT_CONFIG = dict(
terms=None,
regex=None,
attr="TEXT",
ignore_excluded=False,
ignore_space_tokens=False,
- term_matcher=GenericTermMatcher.exact,
+ term_matcher="exact",
term_matcher_config={},
+ span_setter={"ents": True},
)
-
-@deprecated_factory(
+create_component = GenericMatcher
+create_component = deprecated_factory(
"matcher",
"eds.matcher",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-@Language.factory(
- "eds.matcher", default_config=DEFAULT_CONFIG, assigns=["doc.ents", "doc.spans"]
-)
-def create_component(
- nlp: Language,
- name: str = "eds.matcher",
- terms: Optional[Dict[str, Union[str, List[str]]]] = None,
- attr: Union[str, Dict[str, str]] = None,
- regex: Optional[Dict[str, Union[str, List[str]]]] = "TEXT",
- ignore_excluded: bool = False,
- ignore_space_tokens: bool = False,
- term_matcher: GenericTermMatcher = GenericTermMatcher.exact,
- term_matcher_config: Dict[str, Any] = {},
-):
- """
- Provides a generic matcher component.
-
- Parameters
- ----------
- nlp : Language
- The spaCy object.
- name: str
- The name of the component.
- terms : Optional[Patterns]
- A dictionary of terms.
- regex : Optional[Patterns]
- A dictionary of regular expressions.
- attr : str
- The default attribute to use for matching.
- Can be overridden using the `terms` and `regex` configurations.
- ignore_excluded : bool
- Whether to skip excluded tokens (requires an upstream
- pipeline to mark excluded tokens).
- ignore_space_tokens: bool
- Whether to skip space tokens during matching.
-
- You won't be able to match on newlines if this is enabled and
- the "spaces"/"newline" option of `eds.normalizer` is enabled (by default).
- term_matcher: GenericTermMatcher
- The matcher to use for matching phrases ?
- One of (exact, simstring)
- term_matcher_config: Dict[str,Any]
- Parameters of the matcher class
- """
- assert not (terms is None and regex is None)
-
- if terms is None:
- terms = dict()
- if regex is None:
- regex = dict()
-
- return GenericMatcher(
- nlp,
- terms=terms,
- attr=attr,
- regex=regex,
- ignore_excluded=ignore_excluded,
- ignore_space_tokens=ignore_space_tokens,
- term_matcher=term_matcher,
- term_matcher_config=term_matcher_config,
- )
+)(create_component)
+create_component = Language.factory(
+ "eds.matcher",
+ assigns=["doc.ents", "doc.spans"],
+)(create_component)
diff --git a/edsnlp/pipelines/core/matcher/matcher.py b/edsnlp/pipelines/core/matcher/matcher.py
index 8fc552bbb..6998d5133 100644
--- a/edsnlp/pipelines/core/matcher/matcher.py
+++ b/edsnlp/pipelines/core/matcher/matcher.py
@@ -1,30 +1,67 @@
-from enum import Enum
from typing import Any, Dict, List, Optional
from spacy.language import Language
from spacy.tokens import Doc, Span
+from typing_extensions import Literal
from edsnlp.matchers.phrase import EDSPhraseMatcher
from edsnlp.matchers.regex import RegexMatcher
from edsnlp.matchers.simstring import SimstringMatcher
from edsnlp.matchers.utils import Patterns
-from edsnlp.pipelines.base import BaseComponent
-from edsnlp.utils.filter import filter_spans
+from edsnlp.pipelines.base import BaseNERComponent, SpanSetterArg
-class GenericTermMatcher(str, Enum):
- exact = "exact"
- simstring = "simstring"
+class GenericMatcher(BaseNERComponent):
+ r"""
+    EDS-NLP simplifies the matching process by exposing an `eds.matcher` component
+ that can match on terms or regular expressions.
+ Examples
+ --------
+ Let us redefine the pipeline :
-class GenericMatcher(BaseComponent):
- """
- Provides a generic matcher component.
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+
+ terms = dict(
+ covid=["coronavirus", "covid19"], # (1)
+ patient="patient", # (2)
+ )
+
+ regex = dict(
+ covid=r"coronavirus|covid[-\s]?19|sars[-\s]cov[-\s]2", # (3)
+ )
+
+ nlp.add_pipe(
+ "eds.matcher",
+ config=dict(
+ terms=terms,
+ regex=regex,
+ attr="LOWER",
+ term_matcher="exact",
+ term_matcher_config={},
+ ),
+ )
+ ```
+
+ 1. Every key in the `terms` dictionary is mapped to a concept.
+ 2. The `eds.matcher` pipeline expects a list of expressions, or a single expression.
+ 3. We can also define regular expression patterns.
+
+ This snippet is complete, and should run as is.
+
+ Patterns, be they `terms` or `regex`, are defined as dictionaries where keys become
+ the label of the extracted entities. Dictionary values are either a single
+ expression or a list of expressions that match the concept.
Parameters
----------
nlp : Language
- The spaCy object.
+ The pipeline object.
+    name : str
+ The name of the component.
terms : Optional[Patterns]
A dictionary of terms.
regex : Optional[Patterns]
@@ -40,44 +77,60 @@ class GenericMatcher(BaseComponent):
You won't be able to match on newlines if this is enabled and
the "spaces"/"newline" option of `eds.normalizer` is enabled (by default).
- term_matcher: GenericTermMatcher
+ term_matcher : Literal["exact", "simstring"]
The matcher to use for matching phrases.
One of "exact" or "simstring"
- term_matcher_config: Dict[str,Any]
+ term_matcher_config : Dict[str,Any]
Parameters of the matcher class
+ span_setter : SpanSetterArg
+ How to set the spans in the doc.
+
+ Authors and citation
+ --------------------
+ The `eds.matcher` pipeline was developed by AP-HP's Data Science team.
"""
def __init__(
self,
nlp: Language,
- terms: Optional[Patterns],
- regex: Optional[Patterns],
- attr: str,
- ignore_excluded: bool,
+ name: Optional[str] = "eds.matcher",
+ *,
+ terms: Optional[Patterns] = None,
+ regex: Optional[Patterns] = None,
+ attr: str = "TEXT",
+ ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
- term_matcher: GenericTermMatcher = GenericTermMatcher.exact,
- term_matcher_config: Dict[str, Any] = None,
+ term_matcher: Literal["exact", "simstring"] = "exact",
+ term_matcher_config: Dict[str, Any] = {},
+ span_setter: SpanSetterArg = {"ents": True},
):
+ super().__init__(nlp=nlp, name=name, span_setter=span_setter)
+
+ if terms is None and regex is None:
+ raise ValueError(
+ "You must provide either `terms` or `regex` to the matcher."
+ )
- self.nlp = nlp
+ terms = terms or {}
+ regex = regex or {}
self.attr = attr
- if term_matcher == GenericTermMatcher.exact:
+ if term_matcher == "exact":
self.phrase_matcher = EDSPhraseMatcher(
self.nlp.vocab,
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
- **(term_matcher_config or {}),
+ **term_matcher_config,
)
- elif term_matcher == GenericTermMatcher.simstring:
+ elif term_matcher == "simstring":
self.phrase_matcher = SimstringMatcher(
self.nlp.vocab,
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
- **(term_matcher_config or {}),
+ **term_matcher_config,
)
else:
raise ValueError(
@@ -134,17 +187,6 @@ def __call__(self, doc: Doc) -> Doc:
"""
matches = self.process(doc)
- for span in matches:
- if span.label_ not in doc.spans:
- doc.spans[span.label_] = []
- doc.spans[span.label_].append(span)
-
- ents, discarded = filter_spans(list(doc.ents) + matches, return_discarded=True)
-
- doc.ents = ents
-
- if "discarded" not in doc.spans:
- doc.spans["discarded"] = []
- doc.spans["discarded"].extend(discarded)
+ self.set_spans(doc, matches)
return doc
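Since the matcher now delegates span assignment to `set_spans`, where matches end up is entirely driven by `span_setter`; a quick sketch:

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe(
    "eds.matcher",
    config=dict(
        terms=dict(covid=["coronavirus", "covid19"]),
        # Write matches to doc.ents and to the "covid" span group
        span_setter={"ents": True, "covid": True},
    ),
)

doc = nlp("Le patient a le coronavirus.")
print(doc.ents)  # (coronavirus,)
print(doc.spans["covid"])  # [coronavirus]
```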
diff --git a/edsnlp/pipelines/core/normalizer/__init__.py b/edsnlp/pipelines/core/normalizer/__init__.py
index 96f0cb1ce..59d71c140 100644
--- a/edsnlp/pipelines/core/normalizer/__init__.py
+++ b/edsnlp/pipelines/core/normalizer/__init__.py
@@ -1,11 +1,4 @@
-from functools import lru_cache
-from typing import Union
-
-from spacy.tokens import Doc, Span, Token
-
-from .accents import Accents
-from .pollution import Pollution
-from .quotes import Quotes
+from spacy.tokens import Token
if not Token.has_extension("excluded"):
Token.set_extension("excluded", default=False)
diff --git a/edsnlp/pipelines/core/normalizer/accents/__init__.py b/edsnlp/pipelines/core/normalizer/accents/__init__.py
index 8ea75cb92..e69de29bb 100644
--- a/edsnlp/pipelines/core/normalizer/accents/__init__.py
+++ b/edsnlp/pipelines/core/normalizer/accents/__init__.py
@@ -1 +0,0 @@
-from .accents import Accents
diff --git a/edsnlp/pipelines/core/normalizer/accents/accents.py b/edsnlp/pipelines/core/normalizer/accents/accents.py
index 7d6f68d15..25e25e984 100644
--- a/edsnlp/pipelines/core/normalizer/accents/accents.py
+++ b/edsnlp/pipelines/core/normalizer/accents/accents.py
@@ -1,24 +1,34 @@
from typing import List, Optional, Tuple
+from spacy import Language
from spacy.tokens import Doc
from . import patterns
-class Accents(object):
+class AccentsConverter:
"""
Normalises accents, using a same-length strategy.
Parameters
----------
+ nlp : Optional[Language]
+ The pipeline object.
+ name : Optional[str]
+ The component name.
accents : List[Tuple[str, str]]
List of accentuated characters and their transcription.
"""
- def __init__(self, accents: Optional[List[Tuple[str, str]]]) -> None:
- if accents is None:
- accents = patterns.accents
-
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: Optional[str] = "eds.spaces",
+ *,
+ accents: List[Tuple[str, str]] = patterns.accents,
+ ) -> None:
+ self.nlp = nlp
+ self.name = name
self.translation_table = str.maketrans(
"".join(accent_group for accent_group, _ in accents),
"".join(rep * len(accent_group) for accent_group, rep in accents),
diff --git a/edsnlp/pipelines/core/normalizer/accents/factory.py b/edsnlp/pipelines/core/normalizer/accents/factory.py
index 8a8564e7f..051bd6710 100644
--- a/edsnlp/pipelines/core/normalizer/accents/factory.py
+++ b/edsnlp/pipelines/core/normalizer/accents/factory.py
@@ -1,29 +1,21 @@
-from typing import List, Optional, Tuple
-
from spacy.language import Language
from edsnlp.utils.deprecation import deprecated_factory
-from .accents import Accents
+from . import patterns
+from .accents import AccentsConverter
DEFAULT_CONFIG = dict(
- accents=None,
+ accents=patterns.accents,
)
-
-@deprecated_factory(
- "accents", "eds.accents", default_config=DEFAULT_CONFIG, assigns=["token.norm"]
-)
-@Language.factory(
+create_component = AccentsConverter
+create_component = deprecated_factory(
+ "accents",
"eds.accents",
- default_config=DEFAULT_CONFIG,
assigns=["token.norm"],
-)
-def create_component(
- nlp: Language,
- name: str,
- accents: Optional[List[Tuple[str, str]]],
-):
- return Accents(
- accents=accents,
- )
+)(create_component)
+create_component = Language.factory(
+ "eds.accents",
+ assigns=["token.norm"],
+)(create_component)
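
This file introduces the registration idiom used throughout the patch: since `spacy.Language.factory(...)` returns a decorator, applying it as a plain call to the component class replaces the old decorated `create_component` function. A minimal sketch with hypothetical names:

```python
# Minimal sketch with hypothetical names ("my.noop"): Language.factory(...)
# returns a decorator, so calling it on a class is equivalent to @-decorating.
from spacy.language import Language
from spacy.tokens import Doc


class NoopComponent:
    def __init__(self, nlp: Language, name: str = "my.noop"):
        self.nlp = nlp
        self.name = name

    def __call__(self, doc: Doc) -> Doc:
        return doc


create_component = NoopComponent
create_component = Language.factory("my.noop", assigns=[])(create_component)
```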
diff --git a/edsnlp/pipelines/core/normalizer/factory.py b/edsnlp/pipelines/core/normalizer/factory.py
index 5fc7e2034..0d7804d8c 100644
--- a/edsnlp/pipelines/core/normalizer/factory.py
+++ b/edsnlp/pipelines/core/normalizer/factory.py
@@ -1,15 +1,15 @@
from typing import Any, Dict, Union
-from spacy import registry
from spacy.language import Language
from edsnlp.utils.deprecation import deprecated_factory
-from .accents.factory import DEFAULT_CONFIG as accents_config
+from .accents.accents import AccentsConverter
from .normalizer import Normalizer
-from .pollution.factory import DEFAULT_CONFIG as pollution_config
-from .quotes.factory import DEFAULT_CONFIG as quotes_config
-from .spaces.factory import DEFAULT_CONFIG as spaces_config
+from .pollution.patterns import default_enabled as default_enabled_pollution
+from .pollution.pollution import PollutionTagger
+from .quotes.quotes import QuotesConverter
+from .spaces.spaces import SpacesTagger
DEFAULT_CONFIG = dict(
accents=True,
@@ -23,15 +23,13 @@
@deprecated_factory(
"normalizer",
"eds.normalizer",
- default_config=DEFAULT_CONFIG,
assigns=["token.norm", "token.tag"],
)
-@Language.factory(
- "eds.normalizer", default_config=DEFAULT_CONFIG, assigns=["token.norm", "token.tag"]
-)
+@Language.factory("eds.normalizer", assigns=["token.norm", "token.tag"])
def create_component(
nlp: Language,
name: str = "eds.normalizer",
+ *,
accents: Union[bool, Dict[str, Any]] = True,
lowercase: Union[bool, Dict[str, Any]] = True,
quotes: Union[bool, Dict[str, Any]] = True,
@@ -50,6 +48,10 @@ def create_component(
Parameters
----------
+ nlp: Language
+ The pipeline object.
+ name : str
+ The component name.
lowercase : bool
Whether to remove case.
accents : Union[bool, Dict[str, Any]]
@@ -63,32 +65,39 @@ def create_component(
"""
if accents:
- config = dict(**accents_config)
- if isinstance(accents, dict):
- config.update(accents)
- accents = registry.get("factories", "eds.accents")(nlp, "eds.accents", **config)
+ accents = AccentsConverter(
+ nlp=nlp,
+ name="eds.accents",
+ **(accents if accents is not True else {}),
+ )
if quotes:
- config = dict(**quotes_config)
- if isinstance(quotes, dict):
- config.update(quotes)
- quotes = registry.get("factories", "eds.quotes")(nlp, "eds.quotes", **config)
+ quotes = QuotesConverter(
+ nlp=nlp,
+ name="eds.quotes",
+ **(quotes if quotes is not True else {}),
+ )
if spaces:
- config = dict(**spaces_config)
- if isinstance(spaces, dict):
- config.update(spaces)
- spaces = registry.get("factories", "eds.spaces")(nlp, "eds.spaces", **config)
+ spaces = SpacesTagger(
+ nlp=nlp,
+ name="eds.spaces",
+ **(spaces if spaces is not True else {}),
+ )
if pollution:
- config = dict(**pollution_config["pollution"])
+ config = dict(default_enabled_pollution)
if isinstance(pollution, dict):
config.update(pollution)
- pollution = registry.get("factories", "eds.pollution")(
- nlp, "eds.pollution", pollution=config
+ pollution = PollutionTagger(
+ nlp=nlp,
+ name="eds.pollution",
+ pollution=config,
)
normalizer = Normalizer(
+ nlp=nlp,
+ name=name,
lowercase=lowercase,
accents=accents or None,
quotes=quotes or None,
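
As the `**(accents if accents is not True else {})` expressions above show, each sub-component flag accepts either a boolean (use the defaults) or a config dict (splatted into the sub-component's constructor). A hedged usage sketch, assuming EDS-NLP with this patch installed:

```python
# Hedged usage sketch: booleans enable a sub-component with its defaults,
# dicts override them (here, enabling the "biology" pollution patterns).
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe(
    "eds.normalizer",
    config=dict(
        lowercase=True,
        accents=True,
        quotes=True,
        pollution=dict(biology=True),
    ),
)
```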
diff --git a/edsnlp/pipelines/core/normalizer/lowercase/__init__.py b/edsnlp/pipelines/core/normalizer/lowercase/__init__.py
deleted file mode 100644
index c6625ff98..000000000
--- a/edsnlp/pipelines/core/normalizer/lowercase/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .factory import remove_lowercase
diff --git a/edsnlp/pipelines/core/normalizer/lowercase/factory.py b/edsnlp/pipelines/core/normalizer/lowercase/factory.py
deleted file mode 100644
index 5205db840..000000000
--- a/edsnlp/pipelines/core/normalizer/lowercase/factory.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from spacy.language import Language
-from spacy.tokens import Doc
-
-
-@Language.component("remove-lowercase", assigns=["token.norm"])
-@Language.component("eds.remove-lowercase", assigns=["token.norm"])
-def remove_lowercase(doc: Doc):
- """
- Add case on the `NORM` custom attribute. Should always be applied first.
-
- Parameters
- ----------
- doc : Doc
- The spaCy `Doc` object.
-
- Returns
- -------
- Doc
- The document, with case put back in `NORM`.
- """
-
- for token in doc:
- token.norm_ = token.text
-
- return doc
diff --git a/edsnlp/pipelines/core/normalizer/normalizer.py b/edsnlp/pipelines/core/normalizer/normalizer.py
index 5dd5f8a96..3d598c572 100644
--- a/edsnlp/pipelines/core/normalizer/normalizer.py
+++ b/edsnlp/pipelines/core/normalizer/normalizer.py
@@ -1,12 +1,13 @@
from typing import Optional
+from spacy import Language
from spacy.tokens import Doc
-from .accents import Accents
-from .lowercase import remove_lowercase
-from .pollution import Pollution
-from .quotes import Quotes
-from .spaces import Spaces
+from .accents.accents import AccentsConverter
+from .pollution.pollution import PollutionTagger
+from .quotes.quotes import QuotesConverter
+from .remove_lowercase.factory import remove_lowercase
+from .spaces.spaces import SpacesTagger
class Normalizer(object):
@@ -22,6 +23,10 @@ class Normalizer(object):
Parameters
----------
+ nlp : Optional[Language]
+ The pipeline object.
+ name : Optional[str]
+ The name of the component.
lowercase : bool
Whether to remove case.
accents : Optional[Accents]
@@ -36,12 +41,17 @@ class Normalizer(object):
def __init__(
self,
- lowercase: bool,
- accents: Optional[Accents],
- quotes: Optional[Quotes],
- spaces: Optional[Spaces],
- pollution: Optional[Pollution],
+ nlp: Optional[Language],
+ name: Optional[str] = "eds.normalizer",
+ *,
+ lowercase: bool = False,
+ accents: Optional[AccentsConverter] = None,
+ quotes: Optional[QuotesConverter] = None,
+ spaces: Optional[SpacesTagger] = None,
+ pollution: Optional[PollutionTagger] = None,
):
+ self.nlp = nlp
+ self.name = name
self.lowercase = lowercase
self.accents = accents
self.quotes = quotes
diff --git a/edsnlp/pipelines/core/normalizer/pollution/__init__.py b/edsnlp/pipelines/core/normalizer/pollution/__init__.py
index 01df9833b..e69de29bb 100644
--- a/edsnlp/pipelines/core/normalizer/pollution/__init__.py
+++ b/edsnlp/pipelines/core/normalizer/pollution/__init__.py
@@ -1 +0,0 @@
-from .pollution import Pollution
diff --git a/edsnlp/pipelines/core/normalizer/pollution/factory.py b/edsnlp/pipelines/core/normalizer/pollution/factory.py
index 11ea82e5f..a58b6d4e3 100644
--- a/edsnlp/pipelines/core/normalizer/pollution/factory.py
+++ b/edsnlp/pipelines/core/normalizer/pollution/factory.py
@@ -1,34 +1,21 @@
-from typing import Dict, List, Optional, Union
-
from spacy.language import Language
from edsnlp.utils.deprecation import deprecated_factory
-from . import Pollution
+from .patterns import default_enabled
+from .pollution import PollutionTagger
DEFAULT_CONFIG = dict(
- pollution=dict(
- information=True,
- bars=True,
- biology=False,
- doctors=True,
- web=True,
- coding=False,
- footer=True,
- ),
+ pollution=default_enabled,
)
-
-@deprecated_factory(
- "pollution", "eds.pollution", default_config=DEFAULT_CONFIG, assigns=["token.tag"]
-)
-@Language.factory("eds.pollution", default_config=DEFAULT_CONFIG, assigns=["token.tag"])
-def create_component(
- nlp: Language,
- name: str,
- pollution: Optional[Dict[str, Union[bool, str, List[str]]]],
-):
- return Pollution(
- nlp,
- pollution=pollution,
- )
+create_component = PollutionTagger
+create_component = deprecated_factory(
+ "pollution",
+ "eds.pollution",
+ assigns=["doc.spans"],
+)(create_component)
+create_component = Language.factory(
+ "eds.pollution",
+ assigns=["doc.spans"],
+)(create_component)
diff --git a/edsnlp/pipelines/core/normalizer/pollution/patterns.py b/edsnlp/pipelines/core/normalizer/pollution/patterns.py
index bdaf4c60d..ad5ae5100 100644
--- a/edsnlp/pipelines/core/normalizer/pollution/patterns.py
+++ b/edsnlp/pipelines/core/normalizer/pollution/patterns.py
@@ -49,3 +49,13 @@
coding=coding,
footer=footer,
)
+
+default_enabled = dict(
+ information=True,
+ bars=True,
+ biology=False,
+ doctors=True,
+ web=True,
+ coding=False,
+ footer=True,
+)
diff --git a/edsnlp/pipelines/core/normalizer/pollution/pollution.py b/edsnlp/pipelines/core/normalizer/pollution/pollution.py
index 1315acb90..0dade709c 100644
--- a/edsnlp/pipelines/core/normalizer/pollution/pollution.py
+++ b/edsnlp/pipelines/core/normalizer/pollution/pollution.py
@@ -9,9 +9,10 @@
from edsnlp.utils.filter import filter_spans
from . import patterns
+from .patterns import default_enabled
-class Pollution(BaseComponent):
+class PollutionTagger(BaseComponent):
"""
Tags pollution tokens.
@@ -26,7 +27,9 @@ class Pollution(BaseComponent):
Parameters
----------
nlp : Language
- Language pipeline object
+ The pipeline object
+ name : Optional[str]
+ The component name.
pollution : Dict[str, Union[str, List[str]]]
Dictionary containing regular expressions of pollution.
"""
@@ -35,14 +38,15 @@ class Pollution(BaseComponent):
def __init__(
self,
nlp: Language,
- pollution: Optional[Dict[str, Union[bool, str, List[str]]]],
+ name: Optional[str] = "eds.pollution",
+ *,
+ pollution: Dict[str, Union[bool, str, List[str]]] = default_enabled,
):
self.nlp = nlp
+ self.name = name
self.nlp.vocab.strings.add("EXCLUDED")
- if pollution is None:
- pollution = {k: True for k in patterns.pollution.keys()}
self.pollution = dict()
for k, v in pollution.items():
diff --git a/edsnlp/pipelines/core/normalizer/quotes/__init__.py b/edsnlp/pipelines/core/normalizer/quotes/__init__.py
index 69d09dcee..e69de29bb 100644
--- a/edsnlp/pipelines/core/normalizer/quotes/__init__.py
+++ b/edsnlp/pipelines/core/normalizer/quotes/__init__.py
@@ -1 +0,0 @@
-from .quotes import Quotes
diff --git a/edsnlp/pipelines/core/normalizer/quotes/factory.py b/edsnlp/pipelines/core/normalizer/quotes/factory.py
index e1dd4d9f3..49db9970d 100644
--- a/edsnlp/pipelines/core/normalizer/quotes/factory.py
+++ b/edsnlp/pipelines/core/normalizer/quotes/factory.py
@@ -1,29 +1,21 @@
-from typing import List, Optional, Tuple
-
from spacy.language import Language
from edsnlp.utils.deprecation import deprecated_factory
-from .quotes import Quotes
+from .patterns import quotes_and_apostrophes
+from .quotes import QuotesConverter
DEFAULT_CONFIG = dict(
- quotes=None,
+ quotes=quotes_and_apostrophes,
)
-
-@deprecated_factory(
- "quotes", "eds.quotes", default_config=DEFAULT_CONFIG, assigns=["token.norm"]
-)
-@Language.factory(
+create_component = QuotesConverter
+create_component = deprecated_factory(
+ "quotes",
"eds.quotes",
- default_config=DEFAULT_CONFIG,
assigns=["token.norm"],
-)
-def create_component(
- nlp: Language,
- name: str,
- quotes: Optional[List[Tuple[str, str]]],
-):
- return Quotes(
- quotes=quotes,
- )
+)(create_component)
+create_component = Language.factory(
+ "eds.quotes",
+ assigns=["token.norm"],
+)(create_component)
diff --git a/edsnlp/pipelines/core/normalizer/quotes/quotes.py b/edsnlp/pipelines/core/normalizer/quotes/quotes.py
index 25c696e86..33249fb29 100644
--- a/edsnlp/pipelines/core/normalizer/quotes/quotes.py
+++ b/edsnlp/pipelines/core/normalizer/quotes/quotes.py
@@ -1,24 +1,35 @@
from typing import List, Optional, Tuple
+from spacy import Language
from spacy.tokens import Doc
from .patterns import quotes_and_apostrophes
-class Quotes(object):
+class QuotesConverter:
"""
We normalise quotes, following this
`source `_.
Parameters
----------
+ nlp : Optional[Language]
+ The pipeline object.
+ name : Optional[str]
+ The component name.
quotes : List[Tuple[str, str]]
List of quotation characters and their transcription.
"""
- def __init__(self, quotes: Optional[List[Tuple[str, str]]]) -> None:
- if quotes is None:
- quotes = quotes_and_apostrophes
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: Optional[str] = "eds.quotes",
+ *,
+ quotes: List[Tuple[str, str]] = quotes_and_apostrophes,
+ ) -> None:
+ self.nlp = nlp
+ self.name = name
self.translation_table = str.maketrans(
"".join(quote_group for quote_group, _ in quotes),
diff --git a/edsnlp/pipelines/ner/disorders/AIDS/__init__.py b/edsnlp/pipelines/core/normalizer/remove_lowercase/__init__.py
similarity index 100%
rename from edsnlp/pipelines/ner/disorders/AIDS/__init__.py
rename to edsnlp/pipelines/core/normalizer/remove_lowercase/__init__.py
diff --git a/edsnlp/pipelines/core/normalizer/remove_lowercase/factory.py b/edsnlp/pipelines/core/normalizer/remove_lowercase/factory.py
new file mode 100644
index 000000000..e1018aed2
--- /dev/null
+++ b/edsnlp/pipelines/core/normalizer/remove_lowercase/factory.py
@@ -0,0 +1,47 @@
+from spacy.language import Language
+from spacy.tokens import Doc
+
+from edsnlp.utils.deprecation import deprecated_factory
+
+
+def remove_lowercase(doc: Doc):
+ """
+ Add case on the `NORM` custom attribute. Should always be applied first.
+
+ Parameters
+ ----------
+ doc : Doc
+ The spaCy `Doc` object.
+
+ Returns
+ -------
+ Doc
+ The document, with case put back in `NORM`.
+ """
+
+ for token in doc:
+ token.norm_ = token.text
+
+ return doc
+
+
+@deprecated_factory("remove-lowercase", "eds.remove_lowercase", assigns=["token.norm"])
+@deprecated_factory(
+ "eds.remove-lowercase", "eds.remove_lowercase", assigns=["token.norm"]
+)
+@Language.factory("eds.remove_lowercase", assigns=["token.norm"])
+def create_component(
+ nlp: Language,
+ name: str,
+):
+ """
+ Add case on the `NORM` custom attribute. Should always be applied first.
+
+ Parameters
+ ----------
+ nlp : Language
+ The pipeline object.
+ name : str
+ The name of the component.
+ """
+ return remove_lowercase # pragma: no cover
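
Since the factory returns the bare `remove_lowercase` function, adding the pipe simply copies `token.text` back into `token.norm_` for every token. A hedged example, assuming EDS-NLP with this patch installed:

```python
# Hedged example: eds.remove_lowercase restores case in token.norm_,
# which otherwise defaults to the lowercase form.
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.remove_lowercase")

doc = nlp("Le patient est suivi à Paris.")
print([token.norm_ for token in doc])
# Expected: 'Paris' keeps its capital, instead of the default 'paris'
```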
diff --git a/edsnlp/pipelines/core/normalizer/spaces/__init__.py b/edsnlp/pipelines/core/normalizer/spaces/__init__.py
index 4ebc2bea2..e69de29bb 100644
--- a/edsnlp/pipelines/core/normalizer/spaces/__init__.py
+++ b/edsnlp/pipelines/core/normalizer/spaces/__init__.py
@@ -1 +0,0 @@
-from .spaces import Spaces
diff --git a/edsnlp/pipelines/core/normalizer/spaces/factory.py b/edsnlp/pipelines/core/normalizer/spaces/factory.py
index 605c4681b..e90133684 100644
--- a/edsnlp/pipelines/core/normalizer/spaces/factory.py
+++ b/edsnlp/pipelines/core/normalizer/spaces/factory.py
@@ -1,29 +1,18 @@
from spacy.language import Language
-from .spaces import Spaces
+from edsnlp.utils.deprecation import deprecated_factory
-DEFAULT_CONFIG = dict()
+from .spaces import SpacesTagger
+DEFAULT_CONFIG = dict(newline=True)
-@Language.factory(
+create_component = SpacesTagger
+create_component = deprecated_factory(
+ "spaces",
"eds.spaces",
- default_config=DEFAULT_CONFIG,
assigns=["token.tag"],
-)
-def create_component(
- nlp: Language,
- name: str,
- newline: bool = True,
-):
- """
- Create a new component to update the `tag_` attribute of tokens.
-
- We assign "SPACE" to `token.tag` to be used by optimized components
- such as the EDSPhraseMatcher
-
- Parameters
- ----------
- newline : bool
- Whether to update the newline tokens too
- """
- return Spaces(newline=newline)
+)(create_component)
+create_component = Language.factory(
+ "eds.spaces",
+ assigns=["token.tag"],
+)(create_component)
diff --git a/edsnlp/pipelines/core/normalizer/spaces/spaces.py b/edsnlp/pipelines/core/normalizer/spaces/spaces.py
index dc333e8fe..3b704634e 100644
--- a/edsnlp/pipelines/core/normalizer/spaces/spaces.py
+++ b/edsnlp/pipelines/core/normalizer/spaces/spaces.py
@@ -1,18 +1,33 @@
+from typing import Optional
+
+from spacy import Language
from spacy.tokens import Doc
-class Spaces(object):
+class SpacesTagger:
"""
We assign "SPACE" to `token.tag` to be used by optimized components
such as the EDSPhraseMatcher
Parameters
----------
+ nlp : Optional[Language]
+ The pipeline object.
+ name : Optional[str]
+ The component name.
newline : bool
Whether to update the newline tokens too
"""
- def __init__(self, newline: bool) -> None:
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: Optional[str] = "eds.spaces",
+ *,
+ newline: bool = True,
+ ) -> None:
+ self.nlp = nlp
+ self.name = name
self.newline = newline
def __call__(self, doc: Doc) -> Doc:
diff --git a/edsnlp/pipelines/core/sentences/factory.py b/edsnlp/pipelines/core/sentences/factory.py
index 5f7731a5f..ac92c1d13 100644
--- a/edsnlp/pipelines/core/sentences/factory.py
+++ b/edsnlp/pipelines/core/sentences/factory.py
@@ -16,23 +16,101 @@
@deprecated_factory(
"sentences",
"eds.sentences",
- default_config=DEFAULT_CONFIG,
assigns=["token.is_sent_start"],
)
@Language.factory(
"eds.sentences",
- default_config=DEFAULT_CONFIG,
assigns=["token.is_sent_start"],
)
def create_component(
nlp: Language,
- name: str,
- punct_chars: Optional[List[str]],
- use_endlines: Optional[bool],
- ignore_excluded: bool,
+ name: str = "eds.sentences",
+ *,
+ punct_chars: Optional[List[str]] = None,
+ use_endlines: Optional[bool] = None,
+ ignore_excluded: bool = True,
):
+ r'''
+ The `eds.sentences` matcher provides an alternative to spaCy's default
+ `sentencizer`, aiming to overcome some of its limitations.
+
+ Indeed, the `sentencizer` merely looks at period characters to detect the end of a
+ sentence, a strategy that often fails in a clinical note setting. Our
+ `eds.sentences` component also classifies end-of-lines as sentence boundaries if
+ the subsequent token begins with an uppercase character, leading to slightly better
+ performance.
+
+ Moreover, the `eds.sentences` component uses the output of the `eds.normalizer`
+ and `eds.endlines` components by default when they are added to the
+ pipeline.
+
+ Examples
+ --------
+ === "EDS-NLP"
+
+ ```{ .python .no-check }
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+
+ text = """Le patient est admis le 23 août 2021 pour une douleur à l'estomac
+ Il lui était arrivé la même chose il y a deux ans."
+ """
+
+ doc = nlp(text)
+
+ for sentence in doc.sents:
+ print("", sentence, "")
+ # Out: Le patient est admis le 23 août 2021 pour une douleur à l'estomac
+ # Out: <\s>
+ # Out: Il lui était arrivé la même chose il y a deux ans. <\s>
+ ```
+
+ === "spaCy sentencizer"
+
+ ```{ .python .no-check }
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("sentencizer")
+
+ text = """Le patient est admis le 23 août 2021 pour une douleur à l'estomac"
+ Il lui était arrivé la même chose il y a deux ans.
+ """
+
+ doc = nlp(text)
+
+ for sentence in doc.sents:
+ print("", sentence, "")
+ # Out: Le patient est admis le 23 août 2021 pour une douleur à l'estomac
+ # Out: Il lui était arrivé la même chose il y a deux ans. <\s>
+ ```
+
+ Notice how EDS-NLP's implementation is more robust to ill-defined sentence endings.
+
+
+ Parameters
+ ----------
+ nlp: Language
+ The pipeline object.
+ name: str
+ The name of the component.
+ punct_chars : Optional[List[str]]
+ Punctuation characters.
+ use_endlines : bool
+ Whether to use endlines prediction.
+ ignore_excluded : bool
+ Whether to skip excluded tokens (requires the upstream `eds.normalizer` pipe).
+
+ Authors and citation
+ --------------------
+ The `eds.sentences` component was developed by AP-HP's Data Science team.
+ '''
+
return SentenceSegmenter(
- nlp.vocab,
+ nlp=nlp,
+ name=name,
punct_chars=punct_chars,
use_endlines=use_endlines,
ignore_excluded=ignore_excluded,
diff --git a/edsnlp/pipelines/core/sentences/sentences.pxd b/edsnlp/pipelines/core/sentences/sentences.pxd
index 531c55830..3564559b8 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pxd
+++ b/edsnlp/pipelines/core/sentences/sentences.pxd
@@ -6,6 +6,7 @@ from spacy.typedefs cimport attr_t
cdef class SentenceSegmenter(object):
+ cdef str name
cdef bool ignore_excluded
cdef attr_t newline_hash
cdef attr_t excluded_hash
diff --git a/edsnlp/pipelines/core/sentences/sentences.pyx b/edsnlp/pipelines/core/sentences/sentences.pyx
index 7123e72eb..f8d3be311 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pyx
+++ b/edsnlp/pipelines/core/sentences/sentences.pyx
@@ -1,43 +1,30 @@
+from spacy import Language
from typing import Iterable, List, Optional
from libcpp cimport bool
-# from spacy.typedefs cimport attr_t
from spacy.attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from spacy.lexeme cimport Lexeme
from spacy.tokens.doc cimport Doc
from spacy.tokens.token cimport TokenC
-from spacy.vocab cimport Vocab
from .terms import punctuation
-
-cdef class SentenceSegmenter(object):
- """
- Segments the Doc into sentences using a rule-based strategy,
- specific to AP-HP documents.
-
- Applies the same rule-based pipeline as spaCy's sentencizer,
- and adds a simple rule on the new lines : if a new line is followed by a
- capitalised word, then it is also an end of sentence.
-
- DOCS: https://spacy.io/api/sentencizer
-
- Arguments
- ---------
- punct_chars : Optional[List[str]]
- Punctuation characters.
- use_endlines : bool
- Whether to use endlines prediction.
- """
-
+cdef class SentenceSegmenter:
def __init__(
- self,
- vocab: Vocab,
- punct_chars: Optional[List[str]],
- use_endlines: bool,
- ignore_excluded: bool = True,
+ self,
+ nlp: Language,
+ name: Optional[str] = None,
+ *,
+ punct_chars: Optional[List[str]],
+ use_endlines: bool,
+ ignore_excluded: bool = True,
):
+ if isinstance(nlp, Language):
+ vocab = nlp.vocab
+ else:
+ vocab = nlp
+ self.name = name
if punct_chars is None:
punct_chars = punctuation
@@ -47,10 +34,14 @@ cdef class SentenceSegmenter(object):
self.excluded_hash = vocab.strings["EXCLUDED"]
self.endline_hash = vocab.strings["ENDLINE"]
self.punct_chars_hash = {vocab.strings[c] for c in punct_chars}
- self.capitalized_shapes_hash = {vocab.strings[shape] for shape in ("Xx", "Xxx", "Xxxx", "Xxxxx")}
+ self.capitalized_shapes_hash = {
+ vocab.strings[shape]
+ for shape in ("Xx", "Xxx", "Xxxx", "Xxxxx")
+ }
if use_endlines:
- print("The use_endlines is deprecated and has been replaced by the ignore_excluded parameter")
+ print("The use_endlines is deprecated and has been replaced by the "
+ "ignore_excluded parameter")
def __call__(self, doc: Doc):
self.process(doc)
@@ -86,20 +77,35 @@ cdef class SentenceSegmenter(object):
if self.ignore_excluded and token.tag == self.excluded_hash:
continue
- is_in_punct_chars = self.punct_chars_hash.const_find(token.lex.orth) != self.punct_chars_hash.const_end()
- is_newline = Lexeme.c_check_flag(token.lex, IS_SPACE) and token.lex.orth == self.newline_hash
+ is_in_punct_chars = (
+ self.punct_chars_hash.const_find(token.lex.orth)
+ != self.punct_chars_hash.const_end()
+ )
+ is_newline = (
+ Lexeme.c_check_flag(token.lex, IS_SPACE)
+ and token.lex.orth == self.newline_hash
+ )
if seen_period or seen_newline:
if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT):
continue
- if is_in_punct_chars or is_newline or Lexeme.c_check_flag(token.lex, IS_PUNCT):
+ if (
+ is_in_punct_chars
+ or is_newline
+ or Lexeme.c_check_flag(token.lex, IS_PUNCT)
+ ):
continue
if seen_period:
doc.c[i].sent_start = 1
seen_newline = False
seen_period = False
else:
- doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
+ doc.c[i].sent_start = (
+ 1 if (
+ self.capitalized_shapes_hash.const_find(token.lex.shape)
+ != self.capitalized_shapes_hash.const_end()
+ ) else -1
+ )
seen_newline = False
seen_period = False
elif is_in_punct_chars:
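
The reformatted branches above leave the segmentation rule unchanged: after a newline, a new sentence is only opened when the next token has a capitalized word shape. A plain-Python paraphrase of that test (the Cython code compares pre-computed vocab string hashes instead of strings, for speed):

```python
# Plain-Python paraphrase of the capitalized-shape test used after a newline;
# the Cython implementation compares shape hashes rather than strings.
CAPITALIZED_SHAPES = {"Xx", "Xxx", "Xxxx", "Xxxxx"}


def starts_sentence_after_newline(token_shape: str) -> bool:
    return token_shape in CAPITALIZED_SHAPES


assert starts_sentence_after_newline("Xxxx")  # e.g. "Elle"
assert not starts_sentence_after_newline("xxxx")  # e.g. "elle"
```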
diff --git a/edsnlp/pipelines/core/terminology/__init__.py b/edsnlp/pipelines/core/terminology/__init__.py
index 026bac2c7..e69de29bb 100644
--- a/edsnlp/pipelines/core/terminology/__init__.py
+++ b/edsnlp/pipelines/core/terminology/__init__.py
@@ -1 +0,0 @@
-from .terminology import TerminologyMatcher, TerminologyTermMatcher
diff --git a/edsnlp/pipelines/core/terminology/factory.py b/edsnlp/pipelines/core/terminology/factory.py
index 28362b5a0..6ba2a3cb6 100644
--- a/edsnlp/pipelines/core/terminology/factory.py
+++ b/edsnlp/pipelines/core/terminology/factory.py
@@ -1,80 +1,27 @@
-from typing import Any, Dict, List, Optional, Union
-
from spacy.language import Language
-from edsnlp.pipelines.core.terminology import TerminologyMatcher, TerminologyTermMatcher
+from edsnlp.utils.deprecation import deprecated_factory
+
+from .terminology import TerminologyMatcher
DEFAULT_CONFIG = dict(
terms=None,
- attr="TEXT",
regex=None,
+ attr="TEXT",
ignore_excluded=False,
ignore_space_tokens=False,
term_matcher="exact",
- term_matcher_config={},
+ term_matcher_config=None,
+ span_setter={"ents": True},
)
-
-@Language.factory(
+create_component = TerminologyMatcher
+create_component = deprecated_factory(
+ "terminology",
"eds.terminology",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- label: str,
- terms: Optional[Dict[str, Union[str, List[str]]]],
- name: str = "eds.terminology",
- attr: Union[str, Dict[str, str]] = "TEXT",
- regex: Optional[Dict[str, Union[str, List[str]]]] = None,
- ignore_excluded: bool = False,
- ignore_space_tokens: bool = False,
- term_matcher: TerminologyTermMatcher = "exact",
- term_matcher_config: Dict[str, Any] = {},
-):
- """
- Provides a terminology matching component.
-
- The terminology matching component differs from the simple matcher component in that
- the `regex` and `terms` keys are used as spaCy's `kb_id`. All matched entities
- have the same label, defined in the top-level constructor (argument `label`).
-
- Parameters
- ----------
- nlp : Language
- The spaCy object.
- name: str
- The name of the component.
- label : str
- Top-level label
- terms : Optional[Patterns]
- A dictionary of terms.
- regex : Optional[Patterns]
- A dictionary of regular expressions.
- attr : str
- The default attribute to use for matching.
- Can be overridden using the `terms` and `regex` configurations.
- ignore_excluded : bool
- Whether to skip excluded tokens (requires an upstream
- pipeline to mark excluded tokens).
- ignore_space_tokens: bool
- Whether to skip space tokens during matching.
- term_matcher: TerminologyTermMatcher
- The matcher to use for matching phrases ?
- One of (exact, simstring)
- term_matcher_config: Dict[str,Any]
- Parameters of the matcher class
- """
- assert not (terms is None and regex is None)
-
- return TerminologyMatcher(
- nlp,
- label=label,
- terms=terms or dict(),
- attr=attr,
- regex=regex or dict(),
- ignore_excluded=ignore_excluded,
- ignore_space_tokens=ignore_space_tokens,
- term_matcher=term_matcher,
- term_matcher_config=term_matcher_config,
- )
+)(create_component)
+create_component = Language.factory(
+ "eds.terminology",
+ assigns=["doc.ents", "doc.spans"],
+)(create_component)
diff --git a/edsnlp/pipelines/core/terminology/terminology.py b/edsnlp/pipelines/core/terminology/terminology.py
index ac1735338..d35a801c2 100644
--- a/edsnlp/pipelines/core/terminology/terminology.py
+++ b/edsnlp/pipelines/core/terminology/terminology.py
@@ -1,37 +1,77 @@
-from enum import Enum
from itertools import chain
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
from spacy.language import Language
from spacy.tokens import Doc, Span
+from typing_extensions import Literal
from edsnlp.matchers.phrase import EDSPhraseMatcher
from edsnlp.matchers.regex import RegexMatcher
from edsnlp.matchers.simstring import SimstringMatcher
from edsnlp.matchers.utils import Patterns
-from edsnlp.pipelines.base import BaseComponent
-from edsnlp.utils.filter import filter_spans
+from edsnlp.pipelines.base import BaseNERComponent, SpanSetterArg
-class TerminologyTermMatcher(str, Enum):
- exact = "exact"
- simstring = "simstring"
+class TerminologyMatcher(BaseNERComponent):
+ r"""
+ EDS-NLP simplifies the terminology matching process by exposing a `eds.terminology`
+ pipeline that can match on terms or regular expressions.
+ The terminology matcher is very similar to the
+ [generic matcher][edsnlp.pipelines.core.matcher.factory.create_component],
+ although the use case differs slightly. The generic matcher is designed to extract
+ any entity, while the terminology matcher is specifically tailored towards high
+ volume terminologies.
-class TerminologyMatcher(BaseComponent):
- """
- Provides a terminology matching component.
+ There are some key differences:
+
+ 1. It labels every matched entity to the same value, provided to the pipeline
+ 2. The keys provided in the `regex` and `terms` dictionaries are used as the
+ `kb_id_` of the entity, which handles fine-grained labelling
+
+ For instance, a terminology matcher could detect every drug mention under the
+ top-level label `drug`, and link each individual mention to a given drug through
+ its `kb_id_` attribute.
+
+ Examples
+ --------
+ Let us redefine the pipeline:
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+
+ terms = dict(
+ covid=["coronavirus", "covid19"], # (1)
+ flu=["grippe saisonnière"], # (2)
+ )
+
+ regex = dict(
+ covid=r"coronavirus|covid[-\s]?19|sars[-\s]cov[-\s]2", # (3)
+ )
+
+ nlp.add_pipe(
+ "eds.terminology",
+ config=dict(
+ label="disease",
+ terms=terms,
+ regex=regex,
+ attr="LOWER",
+ ),
+ )
+ ```
+
+ 1. Every key in the `terms` dictionary is mapped to a concept.
+ 2. The `eds.terminology` pipeline expects a list of expressions, or a single expression.
+ 3. We can also define regular expression patterns.
- The terminology matching component differs from the simple matcher component in that
- the `regex` and `terms` keys are used as spaCy's `kb_id`. All matched entities
- have the same label, defined in the top-level constructor (argument `label`).
+ This snippet is complete, and should run as is.
Parameters
----------
nlp : Language
- The spaCy object.
- label : str
- Top-level label
+ The pipeline object
terms : Optional[Patterns]
A dictionary of terms.
regex : Optional[Patterns]
@@ -44,33 +84,55 @@ class TerminologyMatcher(BaseComponent):
pipeline to mark excluded tokens).
ignore_space_tokens: bool
Whether to skip space tokens during matching.
- term_matcher: TerminologyTermMatcher
+ term_matcher: Literal["exact", "simstring"]
The matcher to use for matching phrases.
One of (exact, simstring)
term_matcher_config: Dict[str,Any]
Parameters of the matcher class
+ label: str
+ Label name to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Patterns, be they `terms` or `regex`, are defined as dictionaries where keys become
+ the `kb_id_` of the extracted entities. Dictionary values are either a single
+ expression or a list of expressions that match the concept (see [example](#usage)).
+
+ Authors and citation
+ --------------------
+ The `eds.terminology` pipeline was developed by AP-HP's Data Science team.
"""
def __init__(
self,
nlp: Language,
- label: str,
- terms: Optional[Patterns],
- regex: Optional[Patterns],
- attr: str,
- ignore_excluded: bool,
+ name: Optional[str] = None,
+ *,
+ terms: Optional[Patterns] = None,
+ regex: Optional[Patterns] = None,
+ attr: str = "TEXT",
+ ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
- term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
- term_matcher_config=None,
+ term_matcher: Literal["exact", "simstring"] = "exact",
+ term_matcher_config: Optional[Dict[str, Any]] = None,
+ label: str,
+ span_setter: SpanSetterArg = {"ents": True},
):
+ self.label = label
- self.nlp = nlp
+ super().__init__(nlp=nlp, name=name, span_setter=span_setter)
- self.label = label
+ if terms is None and regex is None:
+ raise ValueError(
+ "You must provide either `terms` or `regex` to the matcher."
+ )
+
+ terms = terms or {}
+ regex = regex or {}
self.attr = attr
- if term_matcher == TerminologyTermMatcher.exact:
+ if term_matcher == "exact":
self.phrase_matcher = EDSPhraseMatcher(
self.nlp.vocab,
attr=attr,
@@ -78,7 +140,7 @@ def __init__(
ignore_space_tokens=ignore_space_tokens,
**(term_matcher_config or {}),
)
- elif term_matcher == TerminologyTermMatcher.simstring:
+ elif term_matcher == "simstring":
self.phrase_matcher = SimstringMatcher(
vocab=self.nlp.vocab,
attr=attr,
@@ -128,8 +190,6 @@ def process(self, doc: Doc) -> List[Span]:
matches = self.phrase_matcher(doc, as_spans=True)
regex_matches = self.regex_matcher(doc, as_spans=True)
- spans = []
-
for match in chain(matches, regex_matches):
span = Span(
doc=match.doc,
@@ -139,9 +199,7 @@ def process(self, doc: Doc) -> List[Span]:
kb_id=match.label,
)
span._.set(self.label, match.label_)
- spans.append(span)
-
- return spans
+ yield span
def __call__(self, doc: Doc) -> Doc:
"""
@@ -159,15 +217,6 @@ def __call__(self, doc: Doc) -> Doc:
"""
matches = self.process(doc)
- if self.label not in doc.spans:
- doc.spans[self.label] = matches
-
- ents, discarded = filter_spans(list(doc.ents) + matches, return_discarded=True)
-
- doc.ents = ents
-
- if "discarded" not in doc.spans:
- doc.spans["discarded"] = []
- doc.spans["discarded"].extend(discarded)
+ self.set_spans(doc, matches)
return doc
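
Following the docstring example above, every match carries the top-level `label` while the originating `terms`/`regex` key lands in `kb_id_`. A hypothetical continuation of that snippet (output hedged, not taken from the original docs):

```python
# Hypothetical continuation of the docstring example: reading back the
# shared label and the per-concept kb_id_ set by the terminology matcher.
doc = nlp("Grippe saisonnière, puis suspicion de covid19.")

for ent in doc.ents:
    print(ent.text, ent.label_, ent.kb_id_)
# Expected along the lines of:
#   Grippe saisonnière disease flu
#   covid19 disease covid
```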
diff --git a/edsnlp/pipelines/factories.py b/edsnlp/pipelines/factories.py
index 6444bf523..0460491b4 100644
--- a/edsnlp/pipelines/factories.py
+++ b/edsnlp/pipelines/factories.py
@@ -1,13 +1,17 @@
-# flake8: noqa: F811
+# ruff: noqa: E501
+# flake8: noqa: F401, F811
from .core.context.factory import create_component as context
from .core.contextual_matcher.factory import create_component as contextual_matcher
from .core.endlines.factory import create_component as endlines
from .core.matcher.factory import create_component as matcher
from .core.normalizer.accents.factory import create_component as accents
from .core.normalizer.factory import create_component as normalizer
-from .core.normalizer.lowercase.factory import remove_lowercase
from .core.normalizer.pollution.factory import create_component as pollution
from .core.normalizer.quotes.factory import create_component as quotes
+from .core.normalizer.remove_lowercase.factory import (
+ create_component as remove_lowercase,
+)
+from .core.normalizer.spaces.factory import create_component as spaces
from .core.sentences.factory import create_component as sentences
from .core.terminology.factory import create_component as terminology
from .misc.consultation_dates.factory import create_component as consultation_dates
@@ -21,18 +25,18 @@
from .ner.behaviors.tobacco.factory import create_component as tobacco
from .ner.cim10.factory import create_component as cim10
from .ner.covid.factory import create_component as covid
-from .ner.disorders.AIDS.factory import create_component as AIDS
+from .ner.disorders.aids.factory import create_component as aids
from .ner.disorders.cerebrovascular_accident.factory import (
create_component as cerebrovascular_accident,
)
-from .ner.disorders.CKD.factory import create_component as CKD
+from .ner.disorders.ckd.factory import create_component as ckd
from .ner.disorders.congestive_heart_failure.factory import (
create_component as congestive_heart_failure,
)
from .ner.disorders.connective_tissue_disease.factory import (
create_component as connective_tissue_disease,
)
-from .ner.disorders.COPD.factory import create_component as COPD
+from .ner.disorders.copd.factory import create_component as copd
from .ner.disorders.dementia.factory import create_component as dementia
from .ner.disorders.diabetes.factory import create_component as diabetes
from .ner.disorders.hemiplegia.factory import create_component as hemiplegia
@@ -51,12 +55,13 @@
from .ner.disorders.solid_tumor.factory import create_component as solid_tumor
from .ner.drugs.factory import create_component as drugs
from .ner.scores.charlson.factory import create_component as charlson
+from .ner.scores.elston_ellis.factory import create_component as elston_ellis
from .ner.scores.emergency.ccmu.factory import create_component as ccmu
from .ner.scores.emergency.gemsa.factory import create_component as gemsa
from .ner.scores.emergency.priority.factory import create_component as priority
from .ner.scores.factory import create_component as score
from .ner.scores.sofa.factory import create_component as sofa
-from .ner.scores.tnm.factory import create_component as tnm
+from .ner.tnm.factory import create_component as tnm
from .ner.umls.factory import create_component as umls
from .qualifiers.family.factory import create_component as family
from .qualifiers.history.factory import create_component as history
diff --git a/edsnlp/pipelines/misc/consultation_dates/__init__.py b/edsnlp/pipelines/misc/consultation_dates/__init__.py
index a4540783d..e69de29bb 100644
--- a/edsnlp/pipelines/misc/consultation_dates/__init__.py
+++ b/edsnlp/pipelines/misc/consultation_dates/__init__.py
@@ -1 +0,0 @@
-from .consultation_dates import ConsultationDates
diff --git a/edsnlp/pipelines/misc/consultation_dates/consultation_dates.py b/edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
index d3e65028d..3a49d7301 100644
--- a/edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
+++ b/edsnlp/pipelines/misc/consultation_dates/consultation_dates.py
@@ -4,21 +4,63 @@
from spacy.language import Language
from spacy.tokens import Doc, Span
-from edsnlp.pipelines.core.matcher import GenericMatcher
-from edsnlp.pipelines.misc.dates import Dates
-from edsnlp.pipelines.misc.dates.factory import DEFAULT_CONFIG
+from edsnlp.pipelines.core.matcher.matcher import GenericMatcher
+from edsnlp.pipelines.misc.dates.factory import DEFAULT_CONFIG, DatesMatcher
+from ...base import SpanSetterArg
from . import patterns as consult_regex
-class ConsultationDates(GenericMatcher):
+class ConsultationDatesMatcher(GenericMatcher):
+ '''
+ The `eds.consultation_dates` matcher consists of two main parts:
+
+ - A **matcher** which finds mentions of _consultation events_ (more details below)
+ - A **date parser** (see the corresponding pipe) that links a date to those events
+
+ Examples
+ --------
+ !!! note
+
+ The matcher has been built to run on **consultation notes**
+ (`CR-CONS` at APHP), so please filter accordingly before proceeding.
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ lowercase=True,
+ accents=True,
+ quotes=True,
+ pollution=False,
+ ),
+ )
+ nlp.add_pipe("eds.consultation_dates")
+
+ text = """
+ XXX
+ Objet : Compte-Rendu de Consultation du 03/10/2018.
+ XXX
"""
- Class to extract consultation dates from "CR-CONS" documents.
- The pipeline populates the `#!python doc.spans['consultation_dates']` list.
+ doc = nlp(text)
- For each extraction `s` in this list, the corresponding date is available
- as `s._.consultation_date`.
+ doc.spans["consultation_dates"]
+ # Out: [Consultation du 03/10/2018]
+
+ doc.spans["consultation_dates"][0]._.consultation_date.to_datetime()
+ # Out: DateTime(2018, 10, 3, 0, 0, 0, tzinfo=Timezone('Europe/Paris'))
+ ```
+
+ Extensions
+ ----------
+ The `eds.consultation_dates` pipeline declares one extension on the `Span` object:
+ the `consultation_date` attribute, which holds the parsed date (convertible via `to_datetime()`, as shown above).
Parameters
----------
@@ -30,28 +72,49 @@ class ConsultationDates(GenericMatcher):
- If `type==list`: Overrides the default list
- If `type==bool`: Uses the default list of True, disable if False
+ This list contains terms directly referring to consultations, such as
+ "_Consultation du..._" or "_Compte rendu du..._". This list is the only one
+ enabled by default since it is fairly precise and not error-prone.
town_mention : Union[List[str], bool]
List of RegEx for all AP-HP hospitals' towns mentions.
- If `type==list`: Overrides the default list
- If `type==bool`: Uses the default list of True, disable if False
+
+ This list contains the towns of each AP-HP's hospital. Its goal is to fetch
+ dates mentioned as "_Paris, le 13 décembre 2015_". It has a high recall but
+ poor precision, since those dates can often be dates of letter redaction
+ instead of consultation dates.
document_date_mention : Union[List[str], bool]
List of RegEx for document date.
- If `type==list`: Overrides the default list
- If `type==bool`: Uses the default list of True, disable if False
- """
+
+ This list contains expressions mentioning the date of creation/edition of a
+ document, such as "_Date du rapport: 13/12/2015_" or "_Signé le 13/12/2015_".
+ Like `town_mention` patterns, it has a high recall but is prone to errors since
+ document date and consultation date aren't necessarily the same.
+
+ Authors and citation
+ --------------------
+ The `eds.consultation_dates` pipeline was developed by AP-HP's Data Science team.
+ '''
def __init__(
self,
nlp: Language,
- consultation_mention: Union[List[str], bool],
- town_mention: Union[List[str], bool],
- document_date_mention: Union[List[str], bool],
- attr: str,
- **kwargs,
+ name: str = "eds.consultation_dates",
+ *,
+ consultation_mention: Union[List[str], bool] = True,
+ town_mention: Union[List[str], bool] = False,
+ document_date_mention: Union[List[str], bool] = False,
+ attr: str = "NORM",
+ ignore_excluded: bool = False,
+ ignore_space_tokens: bool = False,
+ label: str = "consultation_date",
+ span_setter: SpanSetterArg = {"ents": True, "consultation_dates": True},
):
-
logger.warning("This pipeline is still in beta")
logger.warning(
"This pipeline should ONLY be used on notes "
@@ -65,11 +128,7 @@ def __init__(
)
if not (nlp.has_pipe("dates") and nlp.get_pipe("dates").on_ents_only is False):
-
- config = dict(**DEFAULT_CONFIG)
- config["on_ents_only"] = "consultation_mentions"
-
- self.date_matcher = Dates(nlp, **config)
+ self.date_matcher = DatesMatcher(nlp, **DEFAULT_CONFIG)
else:
self.date_matcher = None
@@ -94,24 +153,29 @@ def __init__(
town_mention=town_mention,
document_date_mention=document_date_mention,
)
+ self.label = label
super().__init__(
- nlp,
+ nlp=nlp,
+ name=name,
regex=regex,
terms=dict(),
attr=attr,
- ignore_excluded=False,
- **kwargs,
+ ignore_excluded=ignore_excluded,
+ ignore_space_tokens=ignore_space_tokens,
+ term_matcher="exact",
+ term_matcher_config=dict(),
+ span_setter=span_setter,
)
self.set_extensions()
- @classmethod
- def set_extensions(cls) -> None:
- if not Span.has_extension("consultation_date"):
- Span.set_extension("consultation_date", default=None)
+ def set_extensions(self) -> None:
+ super().set_extensions()
+ if not Span.has_extension(self.label):
+ Span.set_extension(self.label, default=None)
- def __call__(self, doc: Doc) -> Doc:
+ def process(self, doc: Doc) -> List[Span]:
"""
Finds entities
@@ -126,37 +190,31 @@ def __call__(self, doc: Doc) -> Doc:
`doc.spans['consultation_dates']` `SpanGroup`
"""
- ents = self.process(doc)
+ matches = list(super().process(doc))
- doc.spans["consultation_mentions"] = ents
- doc.spans["consultation_dates"] = []
+ self.date_matcher.span_getter = lambda d: [m.sent for m in matches]
+ dates = [s for s in self.date_matcher.process(doc) if s.label_ == "date"]
+ self.date_matcher.span_getter = None
- if self.date_matcher is not None:
- doc = self.date_matcher(doc)
-
- for mention in ents:
+ for match in matches:
# Looking for a date
# - In the same sentence
# - Not less than 10 tokens AFTER the consultation mention
matching_dates = [
date
- for date in doc.spans["dates"]
+ for date in dates
if (
- (mention.sent == date.sent)
- and (date.start > mention.start)
- and (date.start - mention.end <= 10)
+ (match.sent == date.sent)
+ and (date.start > match.start)
+ and (date.start - match.end <= 10)
)
]
if matching_dates:
# We keep the first mention of a date
kept_date = min(matching_dates, key=lambda d: d.start)
- span = doc[mention.start : kept_date.end]
- span.label_ = mention.label_
+ span = doc[match.start : kept_date.end]
+ span.label_ = self.label
span._.consultation_date = kept_date._.date
- doc.spans["consultation_dates"].append(span)
-
- del doc.spans["consultation_mentions"]
-
- return doc
+ yield span
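
The rewritten `process` keeps the original linking heuristic: a consultation mention is associated with the first date of the same sentence that starts after it, at most 10 tokens away. A hedged distillation of that test (hypothetical helper mirroring the condition above):

```python
# Hedged distillation of the mention/date linking rule implemented above.
from spacy.tokens import Span


def is_matching_date(mention: Span, date: Span, max_distance: int = 10) -> bool:
    return (
        mention.sent == date.sent  # same sentence
        and date.start > mention.start  # the date begins after the mention
        and date.start - mention.end <= max_distance  # at most 10 tokens apart
    )
```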
diff --git a/edsnlp/pipelines/misc/consultation_dates/factory.py b/edsnlp/pipelines/misc/consultation_dates/factory.py
index 53d334139..e415db383 100644
--- a/edsnlp/pipelines/misc/consultation_dates/factory.py
+++ b/edsnlp/pipelines/misc/consultation_dates/factory.py
@@ -1,41 +1,26 @@
-from typing import List, Union
-
from spacy.language import Language
-from edsnlp.pipelines.misc.consultation_dates import ConsultationDates
from edsnlp.utils.deprecation import deprecated_factory
+from .consultation_dates import ConsultationDatesMatcher
+
DEFAULT_CONFIG = dict(
consultation_mention=True,
town_mention=False,
document_date_mention=False,
attr="NORM",
+ ignore_excluded=False,
+ ignore_space_tokens=False,
+ label="consultation_date",
+ span_setter={"ents": True, "consultation_dates": True},
)
-
-@deprecated_factory(
+create_component = deprecated_factory(
"consultation_dates",
"eds.consultation_dates",
- default_config=DEFAULT_CONFIG,
- assigns=["doc._.consultation_dates"],
-)
-@Language.factory(
+ assigns=["doc.spans", "doc.ents"],
+)(ConsultationDatesMatcher)
+create_component = Language.factory(
"eds.consultation_dates",
- default_config=DEFAULT_CONFIG,
- assigns=["doc._.consultation_dates"],
-)
-def create_component(
- nlp: Language,
- name: str,
- attr: str,
- consultation_mention: Union[List[str], bool],
- town_mention: Union[List[str], bool],
- document_date_mention: Union[List[str], bool],
-):
- return ConsultationDates(
- nlp,
- attr=attr,
- consultation_mention=consultation_mention,
- document_date_mention=document_date_mention,
- town_mention=town_mention,
- )
+ assigns=["doc.spans", "doc.ents"],
+)(create_component)
diff --git a/edsnlp/pipelines/misc/dates/__init__.py b/edsnlp/pipelines/misc/dates/__init__.py
index b9b4fba83..4bd4a4974 100644
--- a/edsnlp/pipelines/misc/dates/__init__.py
+++ b/edsnlp/pipelines/misc/dates/__init__.py
@@ -1 +1 @@
-from .dates import Dates
+from .dates import DatesMatcher
diff --git a/edsnlp/pipelines/misc/dates/dates.py b/edsnlp/pipelines/misc/dates/dates.py
index e8e5d708a..ec0f5af32 100644
--- a/edsnlp/pipelines/misc/dates/dates.py
+++ b/edsnlp/pipelines/misc/dates/dates.py
@@ -1,35 +1,109 @@
"""`eds.dates` pipeline."""
-
+import warnings
from itertools import chain
from typing import Dict, Iterable, List, Optional, Tuple, Union
from loguru import logger
from spacy.language import Language
from spacy.tokens import Doc, Span
+from typing_extensions import Literal
from edsnlp.matchers.regex import RegexMatcher
-from edsnlp.pipelines.base import BaseComponent
-from edsnlp.utils.filter import filter_spans
+from edsnlp.pipelines.base import (
+ BaseNERComponent,
+ SpanGetterArg,
+ SpanSetterArg,
+ get_spans,
+ validate_span_getter,
+)
+from edsnlp.utils.filter import align_spans, filter_spans
from . import patterns
-from .models import AbsoluteDate, Duration, Mode, Period, RelativeDate
-
-PERIOD_PROXIMITY_THRESHOLD = 3
+from .models import AbsoluteDate, Bound, Duration, Mode, Period, RelativeDate
-class Dates(BaseComponent):
+class DatesMatcher(BaseNERComponent):
"""
- Tags and normalizes dates, using the open-source `dateparser` library.
+ The `eds.dates` matcher detects and normalizes dates within a medical document.
+ We use simple regular expressions to extract date mentions.
+
+ Scope
+ -----
+ The `eds.dates` pipeline finds absolute (eg `23/08/2021`) and relative (eg `hier`,
+ `la semaine dernière`) dates alike. It also handles mentions of duration.
+
+ | Type | Example |
+ | ---------- | ----------------------------- |
+ | `absolute` | `3 mai`, `03/05/2020` |
+ | `relative` | `hier`, `la semaine dernière` |
+ | `duration` | `pendant quatre jours` |
+
+ See the [tutorial](../../tutorials/detecting-dates.md) for a presentation of a
+ full pipeline featuring the `eds.dates` component.
+
+ Usage
+ -----
+
+ ```python
+ import spacy
+
+ import pendulum
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.dates")
+
+ text = (
+ "Le patient est admis le 23 août 2021 pour une douleur à l'estomac. "
+ "Il lui était arrivé la même chose il y a un an pendant une semaine. "
+ "Il a été diagnostiqué en mai 1995."
+ )
+
+ doc = nlp(text)
+
+ dates = doc.spans["dates"]
+ dates
+ # Out: [23 août 2021, il y a un an, mai 1995]
+
+ dates[0]._.date.to_datetime()
+ # Out: 2021-08-23T00:00:00+02:00
+
+ dates[1]._.date.to_datetime()
+ # Out: None
+
+ note_datetime = pendulum.datetime(2021, 8, 27, tz="Europe/Paris")
+
+ dates[1]._.date.to_datetime(note_datetime=note_datetime)
+ # Out: 2020-08-27T00:00:00+02:00
+
+ date_2_output = dates[2]._.date.to_datetime(
+ note_datetime=note_datetime,
+ infer_from_context=True,
+ tz="Europe/Paris",
+ default_day=15,
+ )
+ date_2_output
+ # Out: 1995-05-15T00:00:00+02:00
+
+ doc.spans["durations"]
+ # Out: [pendant une semaine]
+ ```
+
+ Extensions
+ ----------
+ The `eds.dates` pipeline declares two extensions on the `Span` object:
+
+ - the `span._.date` attribute of a date contains a parsed version of the date.
+ - the `span._.duration` attribute of a duration contains a parsed version of the
+ duration.
- The pipeline uses spaCy's `filter_spans` function.
- It filters out false positives, and introduce a hierarchy between patterns.
- For instance, in case of ambiguity, the pipeline will decide that a date is a
- date without a year rather than a date without a day.
+ As with other components, you can use the `span._.value` attribute to get either the
+ parsed date or the duration depending on the span.
Parameters
----------
- nlp : spacy.language.Language
- Language pipeline object
+ nlp : Language
+ The pipeline object
+ name : Optional[str]
+ Name of the pipeline component
absolute : Union[List[str], str]
List of regular expressions for absolute dates.
relative : Union[List[str], str]
@@ -40,8 +114,20 @@ class Dates(BaseComponent):
(eg `pendant trois mois`).
false_positive : Union[List[str], str]
List of regular expressions for false positive (eg phone numbers, etc).
+ span_getter : SpanGetterArg
+ Where to look for dates in the doc. By default, look in the whole doc. You can
+ combine this with the `merge_mode` argument for interesting results.
+ merge_mode : Literal["intersect", "align"]
+ How to merge matched dates with the spans from `span_getter`, if given:
+
+ - `intersect`: return only the matches that fall in the `span_getter` spans
+ - `align`: if a date overlaps a span from `span_getter` (e.g. a date extracted
+ by a machine learning model), return the `span_getter` span instead, and
+ assign all the parsed information (`._.date` / `._.duration`) to it. Otherwise
+ don't return the date.
on_ents_only : Union[bool, str, Iterable[str]]
- Wether to look on dates in the whole document or in specific sentences:
+ Deprecated, use `span_getter` and `merge_mode` instead.
+ Whether to look for dates in the whole document or in specific sentences:
- If `True`: Only look in the sentences of each entity in doc.ents
- If False: Look in the whole document
@@ -49,28 +135,76 @@ class Dates(BaseComponent):
each entity in `#!python doc.spans[key]`
detect_periods : bool
Whether to detect periods (experimental)
+ detect_time: bool
+ Whether to detect time inside dates
+ period_proximity_threshold : int
+ Max number of words between two dates to extract a period.
as_ents : bool
+ Deprecated, use span_setter instead.
Whether to treat dates as entities
attr : str
spaCy attribute to use
+ date_label : str
+ Label to use for dates
+ duration_label : str
+ Label to use for durations
+ period_label : str
+ Label to use for periods
+ span_setter : SpanSetterArg
+ How to set matches in the doc.
+
+ Authors and citation
+ --------------------
+ The `eds.dates` pipeline was developed by AP-HP's Data Science team.
"""
# noinspection PyProtectedMember
def __init__(
self,
nlp: Language,
- absolute: Optional[List[str]],
- relative: Optional[List[str]],
- duration: Optional[List[str]],
- false_positive: Optional[List[str]],
- on_ents_only: Union[bool, str, Iterable[str]],
- detect_periods: bool,
- detect_time: bool,
- as_ents: bool,
- attr: str,
+ name: str = "eds.dates",
+ *,
+ absolute: Optional[List[str]] = None,
+ relative: Optional[List[str]] = None,
+ duration: Optional[List[str]] = None,
+ false_positive: Optional[List[str]] = None,
+ on_ents_only: Union[bool, str, Iterable[str]] = False,
+ span_getter: Optional[SpanGetterArg] = None,
+ merge_mode: Literal["intersect", "align"] = "intersect",
+ detect_periods: bool = False,
+ detect_time: bool = True,
+ period_proximity_threshold: int = 3,
+ as_ents: bool = False,
+ attr: str = "LOWER",
+ date_label: str = "date",
+ duration_label: str = "duration",
+ period_label: str = "period",
+ span_setter: SpanSetterArg = {
+ "dates": ["date"],
+ "durations": ["duration"],
+ "periods": ["period"],
+ },
):
-
- self.nlp = nlp
+ self.date_label = date_label
+ self.duration_label = duration_label
+ self.period_label = period_label
+
+ # Backward compatibility
+ if as_ents is True:
+ warnings.warn(
+ "The `as_ents` argument is deprecated."
+ + (
+ " Pass `span_setter={} instead.".format(
+ {**span_setter, "ents": [self.date_label, self.duration_label]}
+ )
+ if isinstance(span_setter, dict)
+ else " Use the `span_setter` argument instead."
+ ),
+ DeprecationWarning,
+ )
+ span_setter = dict(span_setter)
+ span_setter["ents"] = True
+ super().__init__(nlp=nlp, name=name, span_setter=span_setter)
if absolute is None:
if detect_time:
@@ -93,7 +227,22 @@ def __init__(
if isinstance(false_positive, str):
false_positive = [false_positive]
- self.on_ents_only = on_ents_only
+ if on_ents_only:
+ assert span_getter is None, (
+ "Cannot use both `on_ents_only` and " "`span_getter`"
+ )
+
+ def span_getter(doc):
+ return (span.sent for span in doc.ents)
+
+ merge_mode = "intersect"
+ warnings.warn(
+ "The `on_ents_only` argument is deprecated."
+ " Use the `span_getter` argument instead.",
+ DeprecationWarning,
+ )
+ self.span_getter = validate_span_getter(span_getter, optional=True)
+ self.merge_mode = merge_mode
self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")
self.regex_matcher.add("false_positive", false_positive)
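
To make the `span_getter`/`merge_mode` pair documented above concrete, a hedged usage sketch (assuming EDS-NLP with this patch installed, and the dict shorthand accepted by `validate_span_getter`):

```python
# Hedged sketch of the two merge modes: "intersect" keeps only regex matches
# that fall inside the given spans, while "align" returns the overlapping
# source spans themselves, carrying the parsed ._.date / ._.duration values.
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe(
    "eds.dates",
    config=dict(
        span_getter={"ents": True},  # look for dates inside existing entities
        merge_mode="align",  # keep the entities' boundaries, attach the parse
    ),
)
```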
@@ -102,6 +251,7 @@ def __init__(
self.regex_matcher.add("duration", duration)
self.detect_periods = detect_periods
+ self.period_proximity_threshold = period_proximity_threshold
self.as_ents = as_ents
@@ -110,8 +260,7 @@ def __init__(
self.set_extensions()
- @classmethod
- def set_extensions(cls) -> None:
+ def set_extensions(self) -> None:
"""
Set extensions for the dates pipeline.
"""
@@ -119,13 +268,16 @@ def set_extensions(cls) -> None:
if not Span.has_extension("datetime"):
Span.set_extension("datetime", default=None)
- if not Span.has_extension("date"):
- Span.set_extension("date", default=None)
+ if not Span.has_extension(self.date_label):
+ Span.set_extension(self.date_label, default=None)
+
+ if not Span.has_extension(self.duration_label):
+ Span.set_extension(self.duration_label, default=None)
- if not Span.has_extension("period"):
- Span.set_extension("period", default=None)
+ if not Span.has_extension(self.period_label):
+ Span.set_extension(self.period_label, default=None)
def process(self, doc: Doc) -> List[Span]:
"""
Find dates in doc.
@@ -140,11 +292,14 @@ def process(self, doc: Doc) -> List[Span]:
list of date spans
"""
- if self.on_ents_only:
- dates = []
- for sent in set([ent.sent for ent in self.get_spans(doc)]):
- dates = chain(
- dates,
+ spans = None
+
+ if self.span_getter is not None:
+ spans = list(get_spans(doc, self.span_getter))
+ matches = []
+ for sent in set([ent.sent for ent in spans]):
+ matches = chain(
+ matches,
self.regex_matcher(
sent,
as_spans=True,
@@ -153,44 +308,74 @@ def process(self, doc: Doc) -> List[Span]:
)
else:
- dates = self.regex_matcher(
+ matches = self.regex_matcher(
doc,
as_spans=True,
return_groupdict=True,
)
- dates = filter_spans(dates)
- dates = [date for date in dates if date[0].label_ != "false_positive"]
+ matches = filter_spans(matches)
+ matches = [date for date in matches if date[0].label_ != "false_positive"]
+
+ matches = list(self.parse(matches))
+
+ if self.span_getter is not None:
+ if self.merge_mode == "align":
+ alignments = align_spans(matches, spans, sort_by_overlap=True)
+ matches = []
+ for span, aligned in zip(spans, alignments):
+ if len(aligned):
+ old = aligned[0]
+ span.label_ = old.label_
+ span._.set(self.date_label, old._.get(self.date_label))
+ span._.set(self.duration_label, old._.get(self.duration_label))
+ matches.append(span)
+
+ elif self.merge_mode == "intersect":
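+                # keep only the matches that fall inside the `span_getter` spans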
+ alignments = align_spans(matches, spans)
+ matches = []
+ for span, aligned in zip(spans, alignments):
+ matches.extend(aligned)
+ matches = list(dict.fromkeys(matches))
+
+ if self.detect_periods:
+ matches.extend(self.process_periods(matches))
- return dates
+ return matches
- def parse(self, dates: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
+ def parse(
+ self, matches: List[Tuple[Span, Dict[str, str]]]
+    ) -> List[Span]:
"""
- Parse dates using the groupdict returned by the matcher.
+ Parse dates/durations using the groupdict returned by the matcher.
Parameters
----------
- dates : List[Tuple[Span, Dict[str, str]]]
+ matches : List[Tuple[Span, Dict[str, str]]]
List of tuples containing the spans and groupdict
returned by the matcher.
Returns
-------
        List[Span]
List of processed spans, with the date parsed.
"""
- for span, groupdict in dates:
+ for span, groupdict in matches:
if span.label_ == "relative":
parsed = RelativeDate.parse_obj(groupdict)
+ span.label_ = self.date_label
+                span._.set(self.date_label, parsed)
elif span.label_ == "absolute":
parsed = AbsoluteDate.parse_obj(groupdict)
+ span.label_ = self.date_label
+                span._.set(self.date_label, parsed)
else:
parsed = Duration.parse_obj(groupdict)
+ span.label_ = self.duration_label
+                span._.set(self.duration_label, parsed)
- span._.date = parsed
-
- return [span for span, _ in dates]
+ return [span for span, _ in matches]
def process_periods(self, dates: List[Span]) -> List[Span]:
"""
@@ -216,29 +401,39 @@ def process_periods(self, dates: List[Span]) -> List[Span]:
dates = list(sorted(dates, key=lambda d: d.start))
for d1, d2 in zip(dates[:-1], dates[1:]):
-
- if d1._.date.mode == Mode.DURATION or d2._.date.mode == Mode.DURATION:
+ v1 = d1._.date if d1.label_ == self.date_label else d1._.duration
+ v2 = d2._.date if d2.label_ == self.date_label else d2._.duration
+ if v1.mode == Mode.DURATION or v2.mode == Mode.DURATION:
pass
- elif d1 in seen or d1._.date.mode is None or d2._.date.mode is None:
+ elif d1 in seen or v1.bound is None or v2.bound is None:
continue
if (
- d1.end - d2.start < PERIOD_PROXIMITY_THRESHOLD
- and d1._.date.mode != d2._.date.mode
+ d1.end - d2.start < self.period_proximity_threshold
+ and v1.bound != v2.bound
):
-
- period = Span(d1.doc, d1.start, d2.end, label="period")
+ period = Span(d1.doc, d1.start, d2.end, label=self.period_label)
# If one date is a duration,
- # the other may not have a registered mode.
- m1 = d1._.date.mode or Mode.FROM
- m2 = d2._.date.mode or Mode.FROM
-
- period._.period = Period.parse_obj(
- {
- m1.value: d1,
- m2.value: d2,
- }
+ # the other may not have a registered bound attribute.
+                if v1.mode == Mode.DURATION:
+                    m1 = Bound.FROM if v2.bound == Bound.UNTIL else Bound.UNTIL
+                    m2 = v2.bound or Bound.FROM
+                elif v2.mode == Mode.DURATION:
+                    m1 = v1.bound or Bound.FROM
+                    m2 = Bound.FROM if v1.bound == Bound.UNTIL else Bound.UNTIL
+                else:
+                    m1 = v1.bound or Bound.FROM
+                    m2 = v2.bound or Bound.FROM
+
+ period._.set(
+ self.period_label,
+ Period.parse_obj(
+ {
+ m1: d1,
+ m2: d2,
+ }
+ ),
)
seen.add(d1)
@@ -262,23 +457,9 @@ def __call__(self, doc: Doc) -> Doc:
doc : Doc
spaCy Doc object, annotated for dates
"""
- dates = self.process(doc)
- dates = self.parse(dates)
-
- doc.spans["dates"] = dates
-
- if self.detect_periods:
- doc.spans["periods"] = self.process_periods(dates)
-
- if self.as_ents:
- ents, discarded = filter_spans(
- list(doc.ents) + dates, return_discarded=True
- )
- doc.ents = ents
+ matches = self.process(doc)
- if "discarded" not in doc.spans:
- doc.spans["discarded"] = []
- doc.spans["discarded"].extend(discarded)
+ self.set_spans(doc, matches)
return doc
diff --git a/edsnlp/pipelines/misc/dates/factory.py b/edsnlp/pipelines/misc/dates/factory.py
index ec2f71022..a09a4feae 100644
--- a/edsnlp/pipelines/misc/dates/factory.py
+++ b/edsnlp/pipelines/misc/dates/factory.py
@@ -1,50 +1,38 @@
-from typing import List, Optional, Set, Union
-
from spacy.language import Language
from edsnlp.utils.deprecation import deprecated_factory
-from . import Dates
+from .dates import DatesMatcher
DEFAULT_CONFIG = dict(
absolute=None,
relative=None,
duration=None,
false_positive=None,
+ on_ents_only=False,
+ span_getter=None,
+ merge_mode="intersect",
detect_periods=False,
detect_time=True,
- on_ents_only=False,
+ period_proximity_threshold=3,
as_ents=False,
attr="LOWER",
+ date_label="date",
+ duration_label="duration",
+ period_label="period",
+ span_setter={
+ "dates": ["date"],
+ "durations": ["duration"],
+ "periods": ["period"],
+ },
)
-
-@deprecated_factory(
- "dates", "eds.dates", default_config=DEFAULT_CONFIG, assigns=["doc.spans"]
-)
-@Language.factory("eds.dates", default_config=DEFAULT_CONFIG, assigns=["doc.spans"])
-def create_component(
- nlp: Language,
- name: str,
- absolute: Optional[List[str]],
- relative: Optional[List[str]],
- duration: Optional[List[str]],
- false_positive: Optional[List[str]],
- on_ents_only: Union[bool, str, List[str], Set[str]],
- detect_periods: bool,
- detect_time: bool,
- as_ents: bool,
- attr: str,
-):
- return Dates(
- nlp,
- absolute=absolute,
- relative=relative,
- duration=duration,
- false_positive=false_positive,
- on_ents_only=on_ents_only,
- detect_periods=detect_periods,
- detect_time=detect_time,
- as_ents=as_ents,
- attr=attr,
- )
+create_component = deprecated_factory(
+ "dates",
+ "eds.dates",
+ assigns=["doc.spans", "doc.ents"],
+)(DatesMatcher)
+create_component = Language.factory(
+ "eds.dates",
+ assigns=["doc.spans", "doc.ents"],
+)(create_component)
diff --git a/edsnlp/pipelines/misc/dates/models.py b/edsnlp/pipelines/misc/dates/models.py
index f676c8060..7f6fb15a4 100644
--- a/edsnlp/pipelines/misc/dates/models.py
+++ b/edsnlp/pipelines/misc/dates/models.py
@@ -1,4 +1,4 @@
-from datetime import datetime
+import datetime
from enum import Enum
from typing import Dict, Optional, Union
@@ -10,18 +10,21 @@
from edsnlp.pipelines.misc.dates.patterns.relative import specific_dict
-class Direction(Enum):
+class Direction(str, Enum):
+ FUTURE = "future"
+ PAST = "past"
+ CURRENT = "current"
- FUTURE = "FUTURE"
- PAST = "PAST"
- CURRENT = "CURRENT"
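+# The from/until semantics of the old Mode enum now live in Bound, while Mode
+# now describes the kind of expression: absolute date, relative date or
+# duration.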
+class Bound(str, Enum):
+ UNTIL = "until"
+ FROM = "from"
-class Mode(Enum):
- FROM = "FROM"
- UNTIL = "UNTIL"
- DURATION = "DURATION"
+class Mode(str, Enum):
+ ABSOLUTE = "absolute"
+ RELATIVE = "relative"
+ DURATION = "duration"
class Period(BaseModel):
@@ -34,8 +37,8 @@ class Config:
class BaseDate(BaseModel):
-
- mode: Optional[Mode] = None
+ mode: Mode = None
+ bound: Optional[Bound] = None
@validator("*", pre=True)
def remove_space(cls, v):
@@ -57,7 +60,7 @@ def validate_strings(cls, d: Dict[str, str]) -> Dict[str, str]:
class AbsoluteDate(BaseDate):
-
+ mode: Mode = Mode.ABSOLUTE
year: Optional[int] = None
month: Optional[int] = None
day: Optional[int] = None
@@ -67,60 +70,100 @@ class AbsoluteDate(BaseDate):
def to_datetime(
self,
+ note_datetime: Optional[Union[pendulum.datetime, datetime.datetime]] = None,
tz: Union[str, pendulum.tz.timezone] = "Europe/Paris",
- note_datetime: Optional[datetime] = None,
- infer_from_context: bool = False,
+ infer_from_context: Optional[bool] = None,
default_day=1,
default_month=1,
**kwargs,
) -> Optional[pendulum.datetime]:
+ """
+ Convert the date to a pendulum.datetime object.
+
+ Parameters
+ ----------
+ tz : Optional[Union[str, pendulum.tz.timezone]]
+ The timezone to use. Defaults to "Europe/Paris".
+ note_datetime : Optional[Union[pendulum.datetime, datetime.datetime]]
+ The datetime of the note. Used to infer missing parts of the date.
+ infer_from_context : bool
+ Whether to infer missing parts of the date from the note datetime.
+ In a (year, month, day) triplet:
+
+ - if only year is missing, it will be inferred from the note datetime
+ - if only month is missing, it will be inferred from the note datetime
+ - if only day is missing, it will be set to `default_day`
+ - if only the year is given, the day and month will be set to
+ `default_day` and `default_month`
+ - if only the month is given, the day will be set to `default_day`
+ and the year will be inferred from the note datetime
+ - if only the day is given, the month and year will be inferred from
+ the note datetime
+ default_day : int
+ Default day to use when inferring missing parts of the date.
+ default_month : int
+ Default month to use when inferring missing parts of the date.
+
+ Returns
+ -------
+ Optional[pendulum.datetime]
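+
+        Examples
+        --------
+        A sketch of the inference behaviour, assuming an illustrative note
+        written on 15 June 2022 (outputs shown for illustration, not
+        validated):
+
+        ```{ .python .no-check }
+        note_datetime = pendulum.datetime(2022, 6, 15, tz="Europe/Paris")
+
+        # year missing: inferred from the note datetime
+        AbsoluteDate(month=3, day=8).to_datetime(note_datetime=note_datetime)
+        # Out: DateTime(2022, 3, 8, 0, 0, 0, tzinfo=Timezone('Europe/Paris'))
+
+        # year only: month and day fall back to default_month / default_day
+        AbsoluteDate(year=2021).to_datetime(note_datetime=note_datetime)
+        # Out: DateTime(2021, 1, 1, 0, 0, 0, tzinfo=Timezone('Europe/Paris'))
+        ```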
+ """
d = self.dict(exclude_none=True)
d.pop("mode", None)
+ d.pop("bound", None)
+
if self.year and self.month and self.day:
try:
return pendulum.datetime(**d, tz=tz)
except ValueError:
return None
+ elif (
+ infer_from_context
+ or infer_from_context is None
+ and note_datetime is not None
+ ):
+ if note_datetime and not isinstance(note_datetime, NaTType):
+ note_datetime = pendulum.instance(note_datetime)
+
+ if self.year is None:
+ d["year"] = note_datetime.year
+ if self.month is None:
+ if self.day is None:
+ d["month"] = default_month
+ else:
+ d["month"] = note_datetime.month
+ if self.day is None:
+ d["day"] = default_day
+ else:
+ if self.year is None:
+ return None
+ if self.month is None:
+ d["month"] = default_month
+ if self.day is None:
+ d["day"] = default_day
- elif infer_from_context:
- # no year
- if (
- not self.year
- and self.month
- and self.day
- and note_datetime
- and not isinstance(note_datetime, NaTType)
- ):
- d["year"] = note_datetime.year
+ try:
return pendulum.datetime(**d, tz=tz)
+ except ValueError:
+ return None
- # no day
- elif self.year and self.month and not self.day:
- d["day"] = default_day
- return pendulum.datetime(**d, tz=tz)
+ return None
- # year only
- elif self.year and not self.month and not self.day:
- d["day"] = default_day
- d["month"] = default_month
- return pendulum.datetime(**d, tz=tz)
+ def to_duration(
+ self,
+ note_datetime: Optional[Union[pendulum.datetime, datetime.datetime]] = None,
+ **kwargs,
+ ) -> Optional[pendulum.Duration]:
- # month only
- elif (
- not self.year
- and self.month
- and not self.day
- and note_datetime
- and not isinstance(note_datetime, NaTType)
- ):
- d["day"] = default_day
- d["year"] = note_datetime.year
- return pendulum.datetime(**d, tz=tz)
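+        """
+        Convert the date to a duration, i.e. the interval between the
+        note datetime and this absolute date.
+        """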
+ if note_datetime and not isinstance(note_datetime, NaTType):
+ note_datetime = pendulum.instance(note_datetime)
+ dt = self.to_datetime(note_datetime=note_datetime, **kwargs)
+ delta = dt.diff(note_datetime)
+ return delta.as_interval()
+ else:
return None
- return None
-
def norm(self) -> str:
year = str(self.year) if self.year else "????"
@@ -148,9 +191,12 @@ def validate_year(cls, v):
if v < 25:
return 2000 + v
+ def __str__(self):
+ return self.norm()
-class Relative(BaseDate):
+class Relative(BaseDate):
+ mode: Mode = Mode.RELATIVE
year: Optional[int] = None
month: Optional[int] = None
week: Optional[int] = None
@@ -184,7 +230,7 @@ def parse_unit(cls, d: Dict[str, str]) -> Dict[str, str]:
return d
- def to_datetime(self, **kwargs) -> pendulum.Duration:
+ def to_duration(self, note_datetime=None, **kwargs) -> pendulum.Duration:
d = self.dict(exclude_none=True)
direction = d.pop("direction", None)
@@ -197,21 +243,38 @@ def to_datetime(self, **kwargs) -> pendulum.Duration:
td = dir * pendulum.duration(**d)
return td
+ def to_datetime(self, **kwargs) -> Optional[pendulum.datetime]:
+ # for compatibility
+ return None
+
class RelativeDate(Relative):
direction: Direction = Direction.CURRENT
def to_datetime(
self,
- note_datetime: Optional[datetime] = None,
+ note_datetime: Optional[Union[pendulum.datetime, datetime.datetime]] = None,
**kwargs,
- ) -> pendulum.Duration:
- td = super(RelativeDate, self).to_datetime()
+ ) -> Optional[pendulum.datetime]:
if note_datetime is not None and not isinstance(note_datetime, NaTType):
+ note_datetime = pendulum.instance(note_datetime)
+
+ d = self.dict(exclude_none=True)
+
+ direction = d.pop("direction", None)
+ dir = -1 if direction == Direction.PAST else 1
+
+ d.pop("mode", None)
+ d.pop("bound", None)
+
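+            # pendulum.duration expects plural keyword arguments
+            # ("days", "months", ...), hence the key renaming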
+ d = {f"{k}s": v for k, v in d.items()}
+
+ td = dir * pendulum.duration(**d)
+
return note_datetime + td
- return td
+ return None
def norm(self) -> str:
@@ -224,7 +287,7 @@ def norm(self) -> str:
norm = f"~0 {key}"
else:
- td = self.to_datetime()
+ td = self.to_duration()
norm = str(td)
if td.in_seconds() > 0:
norm = f"+{norm}"
@@ -256,11 +319,20 @@ def handle_specifics(cls, d: Dict[str, str]) -> Dict[str, str]:
return d
+ def __str__(self):
+ return self.norm()
+
class Duration(Relative):
mode: Mode = Mode.DURATION
def norm(self) -> str:
-
- td = self.to_datetime()
+ td = self.to_duration()
return f"during {td}"
+
+ def to_duration(self, note_datetime=None, **kwargs) -> pendulum.Duration:
+ d = self.dict(exclude_none=True)
+
+ d = {f"{k}s": v for k, v in d.items() if k not in ("mode", "bound")}
+
+ return pendulum.duration(**d)
diff --git a/edsnlp/pipelines/misc/dates/patterns/atomic/directions.py b/edsnlp/pipelines/misc/dates/patterns/atomic/directions.py
index 12c57ad1d..b467ce83d 100644
--- a/edsnlp/pipelines/misc/dates/patterns/atomic/directions.py
+++ b/edsnlp/pipelines/misc/dates/patterns/atomic/directions.py
@@ -1,13 +1,13 @@
from edsnlp.utils.regex import make_pattern
preceding_directions = [
-    r"(?P<direction_PAST>depuis|depuis\s+le|il\s+y\s+a|à)",
-    r"(?P<direction_FUTURE>dans)",
+    r"(?P<direction_past>depuis|depuis\s+le|il\s+y\s+a|à)",
+    r"(?P<direction_future>dans)",
]
following_directions = [
-    r"(?P<direction_FUTURE>prochaine?s?|suivante?s?|plus\s+tard)",
-    r"(?P<direction_PAST>derni[eè]re?s?|passée?s?|pr[ée]c[ée]dente?s?|plus\s+t[ôo]t)",
+    r"(?P<direction_future>prochaine?s?|suivante?s?|plus\s+tard)",
+    r"(?P<direction_past>derni[eè]re?s?|passée?s?|pr[ée]c[ée]dente?s?|plus\s+t[ôo]t)",
]
preceding_direction_pattern = make_pattern(preceding_directions, with_breaks=True)
diff --git a/edsnlp/pipelines/misc/dates/patterns/atomic/modes.py b/edsnlp/pipelines/misc/dates/patterns/atomic/modes.py
index 001c65d07..d888f5015 100644
--- a/edsnlp/pipelines/misc/dates/patterns/atomic/modes.py
+++ b/edsnlp/pipelines/misc/dates/patterns/atomic/modes.py
@@ -1,8 +1,8 @@
from edsnlp.utils.regex import make_pattern
modes = [
-    r"(?P<mode_FROM>depuis|depuis\s+le|[àa]\s+partir\s+d[eu]|du)",
-    r"(?P<mode_UNTIL>jusqu'[àa]u?|au)",
+    r"(?P<bound_from>depuis|depuis\s+le|[àa]\s+partir\s+d[eu]|du)",
+    r"(?P<bound_until>jusqu'[àa]u?|au)",
]
mode_pattern = make_pattern(modes, with_breaks=True)
diff --git a/edsnlp/pipelines/misc/dates/patterns/relative.py b/edsnlp/pipelines/misc/dates/patterns/relative.py
index b37168e6f..be8279faf 100644
--- a/edsnlp/pipelines/misc/dates/patterns/relative.py
+++ b/edsnlp/pipelines/misc/dates/patterns/relative.py
@@ -32,10 +32,10 @@ def make_specific_pattern(mode: str = "forward"):
specific = {
- "minus1": (r"hier", dict(direction="PAST", day=1)),
- "minus2": (r"avant[-\s]hier", dict(direction="PAST", day=2)),
- "plus1": (r"demain", dict(direction="FUTURE", day=1)),
- "plus2": (r"après[-\s]demain", dict(direction="FUTURE", day=2)),
+ "minus1": (r"hier", dict(direction="past", day=1)),
+ "minus2": (r"avant[-\s]hier", dict(direction="past", day=2)),
+ "plus1": (r"demain", dict(direction="future", day=1)),
+ "plus2": (r"après[-\s]demain", dict(direction="future", day=2)),
}
specific_pattern = make_pattern(
diff --git a/edsnlp/pipelines/misc/measurements/factory.py b/edsnlp/pipelines/misc/measurements/factory.py
index e4acfd7d3..3cd76091b 100644
--- a/edsnlp/pipelines/misc/measurements/factory.py
+++ b/edsnlp/pipelines/misc/measurements/factory.py
@@ -1,62 +1,35 @@
-from typing import Dict, List, Optional, Tuple, Union
-
from spacy.language import Language
-import edsnlp.pipelines.misc.measurements.patterns as patterns
-from edsnlp.pipelines.misc.measurements.measurements import (
- MeasureConfig,
- MeasurementsMatcher,
- MergeStrategy,
- UnitConfig,
-)
from edsnlp.utils.deprecation import deprecated_factory
+from . import patterns
+from .measurements import MeasurementsMatcher
+
DEFAULT_CONFIG = dict(
- attr="NORM",
- measurements=None,
- ignore_excluded=True,
+ measurements=list(patterns.common_measurements.keys()), # noqa: E501
units_config=patterns.units_config,
number_terms=patterns.number_terms,
- unit_divisors=patterns.unit_divisors,
stopwords=patterns.stopwords,
+ unit_divisors=patterns.unit_divisors,
+ ignore_excluded=True,
compose_units=True,
+ attr="NORM",
extract_ranges=False,
range_patterns=patterns.range_patterns,
+ after_snippet_limit=6,
+ before_snippet_limit=10,
+ span_getter=None,
+ merge_mode="intersect",
as_ents=False,
- merge_mode=MergeStrategy.union,
+ span_setter=None,
)
-
-@Language.factory("eds.measurements", default_config=DEFAULT_CONFIG)
-@deprecated_factory("eds.measures", "eds.measurements", default_config=DEFAULT_CONFIG)
-def create_component(
- nlp: Language,
- name: str,
- measurements: Optional[Union[Dict[str, MeasureConfig], List[str]]],
- units_config: Dict[str, UnitConfig],
- number_terms: Dict[str, List[str]],
- stopwords: List[str],
- unit_divisors: List[str],
- ignore_excluded: bool,
- compose_units: bool,
- attr: str,
- extract_ranges: bool,
- range_patterns: List[Tuple[Optional[str], Optional[str]]],
- as_ents: bool,
- merge_mode: MergeStrategy,
-):
- return MeasurementsMatcher(
- nlp,
- measurements=measurements,
- units_config=units_config,
- number_terms=number_terms,
- stopwords=stopwords,
- unit_divisors=unit_divisors,
- ignore_excluded=ignore_excluded,
- compose_units=compose_units,
- attr=attr,
- extract_ranges=extract_ranges,
- range_patterns=range_patterns,
- as_ents=as_ents,
- merge_mode=merge_mode,
- )
+create_component = deprecated_factory(
+ "eds.measures",
+ "eds.measurements",
+ assigns=["doc.spans", "doc.ents"],
+)(MeasurementsMatcher)
+create_component = Language.factory(
+ "eds.measurements",
+ assigns=["doc.spans", "doc.ents"],
+)(create_component)
diff --git a/edsnlp/pipelines/misc/measurements/measurements.py b/edsnlp/pipelines/misc/measurements/measurements.py
index a1262cb95..5fb2414b1 100644
--- a/edsnlp/pipelines/misc/measurements/measurements.py
+++ b/edsnlp/pipelines/misc/measurements/measurements.py
@@ -2,7 +2,6 @@
import re
import unicodedata
from collections import defaultdict
-from enum import Enum
from functools import lru_cache
from itertools import repeat
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
@@ -10,37 +9,27 @@
import regex
import spacy
from spacy.tokens import Doc, Span
-from typing_extensions import NotRequired, TypedDict
+from typing_extensions import Literal, NotRequired, TypedDict
from edsnlp.matchers.phrase import EDSPhraseMatcher
from edsnlp.matchers.regex import RegexMatcher
from edsnlp.matchers.utils import get_text
+from edsnlp.pipelines.base import (
+ BaseNERComponent,
+ SpanGetterArg,
+ SpanSetterArg,
+ get_spans,
+ validate_span_getter,
+)
from edsnlp.pipelines.misc.measurements import patterns
-from edsnlp.utils.collections import dedup
from edsnlp.utils.filter import align_spans, filter_spans, get_span_group
__all__ = ["MeasurementsMatcher"]
-
AFTER_SNIPPET_LIMIT = 6
BEFORE_SNIPPET_LIMIT = 10
-class MergeStrategy(str, Enum):
- """
- The strategy to use when merging measurements.
- """
-
- # Align the new measurement to existing spans
- align = "align"
-
- # Only extract measurements if they fall within an existing span
- intersect = "intersect"
-
- # Extract measurements regardless of whether they fall within an existing span
- union = "union"
-
-
class UnitConfig(TypedDict):
dim: str
degree: int
@@ -66,7 +55,7 @@ class UnitlessPatternConfigWithName(TypedDict):
name: str
-class MeasureConfig(TypedDict):
+class MsrConfig(TypedDict):
unit: str
unitless_patterns: NotRequired[List[UnitlessPatternConfig]]
name: NotRequired[str]
@@ -270,93 +259,246 @@ def verify(cls, ent):
return True
-class MeasurementsMatcher:
+class MeasurementsMatcher(BaseNERComponent):
+ '''
+ The `eds.measurements` matcher detects and normalizes numerical measurements
+ within a medical document.
+
+ !!! warning
+
+        The `eds.measurements` pipeline is still in active development and has
+        not been rigorously validated. If you come across a measurement
+        expression that goes undetected, please file an issue!
+
+ Scope
+ -----
+    The `eds.measurements` matcher can extract simple measurements (e.g. `3cm`).
+    It can also detect elliptic enumerations (e.g. `32, 33 et 34kg`) of
+    measurements of the same type, and split the measurements accordingly.
+
+    The normalized value can then be accessed via the `span._.{measure_name}`
+    attribute (for instance `span._.size` or `span._.weight`) and converted on
+    the fly to a desired unit. As with other components, the `span._.value`
+    extension can also be used to access the normalized value of any
+    measurement span.
+
+
+ The current matcher annotates the following measurements out of the box:
+
+ | Measurement name | Example |
+ |------------------|------------------------|
+ | `size` | `1m50`, `1.50m` |
+ | `weight` | `12kg`, `1kg300` |
+ | `bmi` | `BMI: 24`, `24 kg.m-2` |
+ | `volume` | `2 cac`, `8ml` |
+
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe(
+ "eds.measurements",
+ config=dict(
+ measurements=["size", "weight", "bmi"],
+ extract_ranges=True,
+ ),
+ )
+
+ text = """
+ Le patient est admis hier, fait 1m78 pour 76kg.
+ Les deux nodules bénins sont larges de 1,2 et 2.4mm.
+ BMI: 24.
+
+ Le nodule fait entre 1 et 1.5 cm
+ """
+
+ doc = nlp(text)
+
+ measurements = doc.spans["measurements"]
+
+ measurements
+ # Out: [1m78, 76kg, 1,2, 2.4mm, 24, entre 1 et 1.5 cm]
+
+ measurements[0]
+ # Out: 1m78
+
+ str(measurements[0]._.size), str(measurements[0]._.value)
+ # Out: ('1.78 m', '1.78 m')
+
+ measurements[0]._.value.cm
+ # Out: 178.0
+
+ measurements[2]
+ # Out: 1,2
+
+ str(measurements[2]._.value)
+ # Out: '1.2 mm'
+
+    measurements[2]._.value.mm
+    # Out: 1.2
+
+ measurements[4]
+ # Out: 24
+
+ str(measurements[4]._.value)
+ # Out: '24 kg_per_m2'
+
+    measurements[4]._.value.kg_per_m2
+    # Out: 24
+
+ str(measurements[5]._.value)
+    # Out: '1-1.5 cm'
+ ```
+
+ To extract all sizes in centimeters, and average range measurements, you can
+ use the following snippet:
+
+ ```python
+ sizes = [
+ sum(item.cm for item in m._.value) / len(m._.value)
+ for m in doc.spans["measurements"]
+ if m.label_ == "size"
+ ]
+ sizes
+ # Out: [178.0, 0.12, 0.24, 1.25]
+ ```
+
+ Customization
+ -------------
+ You can declare custom measurements by altering the patterns:
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe(
+ "eds.measurements",
+ config=dict(
+ measurements={
+ "my_custom_surface_measurement": {
+ # This measurement unit is homogenous to square meters
+ "unit": "m2",
+ # Handle cases like "surface: 1.8" (implied m2),
+ # vs "surface: 50" (implied cm2)
+ "unitless_patterns": [
+ {
+ "terms": ["surface", "aire"],
+ "ranges": [
+ {"unit": "m2", "min": 0, "max": 9},
+ {"unit": "cm2", "min": 10, "max": 100},
+ ],
+ }
+ ],
+ },
+ }
+ ),
+ )
+ ```
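+
+    Assuming such a configuration, the new measurement should then be available
+    like the built-in ones (a sketch, outputs not validated):
+
+    ```{ .python .no-check }
+    doc = nlp("Lésion d'une surface de 1.5")
+    doc.spans["my_custom_surface_measurement"]
+    # Out: [1.5]
+    ```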
+
+ Extensions
+ ----------
+ The `eds.measurements` pipeline declares its extensions dynamically, depending
+ on the `measurements` parameter: each measurement gets its own extension, and
+ is assigned to a different span group.
+
+ Parameters
+ ----------
+ nlp : Language
+ The pipeline object
+ name : str
+ The name of the component.
+ measurements : Union[str, List[Union[str, MsrConfig]], Dict[str, MsrConfig]]
+        A measure name, a list of names or configs, or a mapping from measure
+        names to configs.
+ Each measure's configuration has the following shape:
+ ```{ .python .no-check }
+ {
+ # the unit (e.g. "kg"),
+ "unit": str,
+ "unitless_patterns": {
+ # preceding trigger terms
+ "terms": List[str],
+ # unitless ranges -> unit patterns
+ "ranges": List[
+ {"min": int, "max": int, "unit": str},
+ {"min": int, "unit": str},
+ ...,
+ ],
+ ...
+ }
+ }
+ ```
+    number_terms : Dict[str, List[str]]
+        A mapping of numbers to their lexical variants
+    stopwords : List[str]
+        A list of stopwords that do not matter when placed between a unitless
+        trigger and a number
+    unit_divisors : List[str]
+        A list of terms used to divide two units (like: m / s)
+ attr : str
+ Whether to match on the text ('TEXT') or on the normalized text ('NORM')
+ ignore_excluded : bool
+ Whether to exclude pollution patterns when matching in the text
+ compose_units: bool
+ Whether to compose units (like "m/s" or "m.s-1")
+ extract_ranges: bool
+ Whether to extract ranges (like "entre 1 et 2 cm")
+ range_patterns: List[Tuple[str, str]]
+ A list of "{FROM} xx {TO} yy" patterns to match range measurements
+    after_snippet_limit : int
+        Maximum word distance used to link a part of a measurement occurring
+        after its number
+    before_snippet_limit : int
+        Maximum word distance used to link a part of a measurement occurring
+        before its number
+    span_setter : Optional[SpanSetterArg]
+ How to set the spans in the document. By default, each measurement will
+ be assigned to its own span group (using either the "name" field of the
+ config, or the key if you passed a dict), and to the "measurements" group.
+ span_getter : SpanGetterArg
+ Where to look for measurements in the doc. By default, look in the whole doc.
+ You can combine this with the `merge_mode` argument for interesting results.
+ merge_mode : Literal["intersect", "align"]
+ How to merge matches with the spans from `span_getter`, if given:
+
+        - `intersect`: return only the matches that fall in the `span_getter`
+          spans
+        - `align`: if a match overlaps a span from `span_getter` (e.g. a match
+          extracted by a machine learning model), return the `span_getter` span
+          instead, and assign the parsed measurement (e.g. `._.value`) to it.
+          Otherwise, don't return the measurement.
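+
+        For instance, a sketch of aligning measurements with spans produced by
+        an upstream component (the "candidates" span group is hypothetical):
+
+        ```{ .python .no-check }
+        nlp.add_pipe(
+            "eds.measurements",
+            config=dict(
+                span_getter={"candidates": True},
+                merge_mode="align",
+            ),
+        )
+        ```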
+
+ Authors and citation
+ --------------------
+ The `eds.measurements` pipeline was developed by AP-HP's Data Science team.
+ '''
+
+ # fmt: off
def __init__(
- self,
- nlp: spacy.Language,
- measurements: Optional[
- Union[
- List[Union[str, MeasureConfig]],
- Dict[str, MeasureConfig],
- ]
- ] = None,
- units_config: Dict[str, UnitConfig] = patterns.units_config,
- number_terms: Dict[str, List[str]] = patterns.number_terms,
- stopwords: List[str] = patterns.stopwords,
- unit_divisors: List[str] = patterns.unit_divisors,
- name: str = "measurements",
- ignore_excluded: bool = True,
- compose_units: bool = True,
- attr: str = "NORM",
- extract_ranges: bool = False,
- range_patterns: List[
- Tuple[Optional[str], Optional[str]]
- ] = patterns.range_patterns, # noqa: E501
- as_ents: bool = False,
- merge_mode: MergeStrategy = MergeStrategy.union,
+ self,
+ nlp: spacy.Language,
+ name: str = "eds.measurements",
+ *,
+ measurements: Union[str, List[Union[str, MsrConfig]], Dict[str, MsrConfig]] = list(patterns.common_measurements.keys()), # noqa: E501
+ units_config: Dict[str, UnitConfig] = patterns.units_config,
+ number_terms: Dict[str, List[str]] = patterns.number_terms,
+ stopwords: List[str] = patterns.stopwords,
+ unit_divisors: List[str] = patterns.unit_divisors,
+ ignore_excluded: bool = True,
+ compose_units: bool = True,
+ attr: str = "NORM",
+ extract_ranges: bool = False,
+ range_patterns: List[Tuple[Optional[str], Optional[str]]] = patterns.range_patterns, # noqa: E501
+ after_snippet_limit: int = 6,
+ before_snippet_limit: int = 10,
+ span_getter: Optional[SpanGetterArg] = None,
+ merge_mode: Literal["intersect", "align"] = "intersect",
+ as_ents: bool = False,
+ span_setter: Optional[SpanSetterArg] = None,
):
- """
- Matcher component to extract measurements.
- A measurements is most often composed of a number and a unit like
- > 1,26 cm
- The unit can also be positioned in place of the decimal dot/comma
- > 1 cm 26
- Some measurements can be composite
- > 1,26 cm x 2,34 mm
- And sometimes they are factorized
- > Les trois kystes mesurent 1, 2 et 3cm.
-
- The recognized measurements are stored in the "measurements" SpanGroup.
- Each span has a `Measurement` object stored in the "value" extension attribute.
-
- Parameters
- ----------
- nlp : Language
- The SpaCy object.
- measurements : Optional[Union[List[Union[str, MeasureConfig]],Dict[str, MeasureConfig]]]
- A mapping from measure names to MeasureConfig
- Each measure's configuration has the following shape:
- {
- "unit": str, # the unit of the measure (like "kg"),
- "unitless_patterns": { # optional patterns to handle unitless cases
- "terms": List[str], # list of preceding terms used to trigger the
- measure
- # Mapping from ranges to unit to handle cases like
- # ("Taille: 1.2" -> 1.20 m vs "Taille: 120" -> 120cm)
- "ranges": List[{
- "min": int,
- "max": int,
- "unit": str,
- }, {
- "min": int,
- "unit": str,
- }, ...],
- }
- number_terms: Dict[str, List[str]
- A mapping of numbers to their lexical variants
- stopwords: List[str]
- A list of stopwords that do not matter when placed between a unitless
- trigger
- and a number
- unit_divisors: List[str]
- A list of terms used to divide two units (like: m / s)
- attr : str
- Whether to match on the text ('TEXT') or on the normalized text ('NORM')
- ignore_excluded : bool
- Whether to exclude pollution patterns when matching in the text
- compose_units: bool
- Whether to compose units (like "m/s" or "m.s-1")
- extract_ranges: bool
- Whether to extract ranges (like "entre 1 et 2 cm")
- range_patterns: List[Tuple[str, str]]
- A list of "{FROM} xx {TO} yy" patterns to match range measurements
- """ # noqa E501
-
- if measurements is None:
- measurements = [
- {**m, "name": k} for k, m in patterns.common_measurements.items()
- ]
- elif isinstance(measurements, (list, tuple)):
+ # fmt: on
+ if isinstance(measurements, str):
+ measurements = [measurements]
+ if isinstance(measurements, (list, tuple)):
measurements = [
m
if isinstance(m, dict)
@@ -369,6 +511,41 @@ def __init__(
self.nlp = nlp
self.name = name
self.unit_registry = UnitRegistry(units_config)
+ self.unitless_patterns: Dict[str, UnitlessPatternConfigWithName] = {}
+ self.unit_part_label_hashes: Set[int] = set()
+ self.unitless_label_hashes: Set[int] = set()
+ self.unit_followers: Dict[str, str] = {}
+ self.measure_names: Dict[str, str] = {}
+ self.compose_units = compose_units
+ self.extract_ranges = extract_ranges
+ self.range_patterns = range_patterns
+ self.span_getter = (
+ validate_span_getter(span_getter)
+ if span_getter is not None
+ else None
+ )
+ self.merge_mode = merge_mode
+ self.before_snippet_limit = before_snippet_limit
+ self.after_snippet_limit = after_snippet_limit
+
+ # MEASURES
+ for measure_config in measurements:
+            measure_name = measure_config["name"]
+            unit = measure_config["unit"]
+            dims = self.unit_registry.parse_unit(unit)[0]
+            self.measure_names[dims] = measure_name
+
+ if span_setter is None:
+ span_setter = {
+ "ents": as_ents,
+ "measurements": True,
+ **{
+ name: [name]
+ for name in self.measure_names.values()
+ }
+ }
+
+ super().__init__(nlp=nlp, name=name, span_setter=span_setter)
+
self.regex_matcher = RegexMatcher(
attr=attr,
ignore_excluded=True,
@@ -379,16 +556,21 @@ def __init__(
ignore_excluded=ignore_excluded,
ignore_space_tokens=True,
)
- self.unitless_patterns: Dict[str, UnitlessPatternConfigWithName] = {}
- self.unit_part_label_hashes: Set[int] = set()
- self.unitless_label_hashes: Set[int] = set()
- self.unit_followers: Dict[str, str] = {}
- self.measure_names: Dict[str, str] = {}
- self.as_ents = as_ents
- self.compose_units = compose_units
- self.extract_ranges = extract_ranges
- self.range_patterns = range_patterns
- self.merge_mode = merge_mode
+ for measure_config in measurements:
+ name = measure_config["name"]
+ unit = measure_config["unit"]
+ self.measure_names[self.unit_registry.parse_unit(unit)[0]] = name
+ if "unitless_patterns" in measure_config:
+ for pattern in measure_config["unitless_patterns"]:
+ pattern_name = f"unitless_{len(self.unitless_patterns)}"
+ self.term_matcher.build_patterns(
+ nlp,
+ terms={
+ pattern_name: pattern["terms"],
+ },
+ )
+ self.unitless_label_hashes.add(nlp.vocab.strings[pattern_name])
+ self.unitless_patterns[pattern_name] = {"name": name, **pattern}
# NUMBER PATTERNS
one_plus = "[1-9][0-9]*"
@@ -423,33 +605,15 @@ def __init__(
},
)
- # MEASURES
- for measure_config in measurements:
- name = measure_config["name"]
- unit = measure_config["unit"]
- self.measure_names[self.unit_registry.parse_unit(unit)[0]] = name
- if "unitless_patterns" in measure_config:
- for pattern in measure_config["unitless_patterns"]:
- pattern_name = f"unitless_{len(self.unitless_patterns)}"
- self.term_matcher.build_patterns(
- nlp,
- terms={
- pattern_name: pattern["terms"],
- },
- )
- self.unitless_label_hashes.add(nlp.vocab.strings[pattern_name])
- self.unitless_patterns[pattern_name] = {"name": name, **pattern}
-
- self.set_extensions()
-
- @classmethod
- def set_extensions(cls) -> None:
+ def set_extensions(self) -> None:
"""
Set extensions for the measurements pipeline.
"""
+ super().set_extensions()
- if not Span.has_extension("value"):
- Span.set_extension("value", default=None)
+ for name in self.measure_names.values():
+ if not Span.has_extension(name):
+ Span.set_extension(name, default=None)
def extract_units(self, term_matches: Iterable[Span]) -> Iterable[Span]:
"""
@@ -476,15 +640,15 @@ def extract_units(self, term_matches: Iterable[Span]) -> Iterable[Span]:
if unit_part.label not in self.unit_part_label_hashes:
continue
if last is not None and (
- (
- unit_part.doc[last.end : unit_part.start].text.strip() != ""
- and len(current)
- )
- or (
- not self.compose_units
- and len(current)
- and current[-1].label_ != "per"
- )
+ (
+ unit_part.doc[last.end: unit_part.start].text.strip() != ""
+ and len(current)
+ )
+ or (
+ not self.compose_units
+ and len(current)
+ and current[-1].label_ != "per"
+ )
):
doc = current[0].doc
# Last non "per" match: we don't want our units to be like `g_per`
@@ -514,10 +678,10 @@ def extract_units(self, term_matches: Iterable[Span]) -> Iterable[Span]:
@classmethod
def make_pseudo_sentence(
- cls,
- doclike: Union[Doc, Span],
- matches: List[Tuple[Span, bool]],
- pseudo_mapping: Dict[int, str],
+ cls,
+ doclike: Union[Doc, Span],
+ matches: List[Tuple[Span, bool]],
+ pseudo_mapping: Dict[int, str],
) -> Tuple[str, List[int]]:
"""
Creates a pseudo sentence (one letter per entity)
@@ -546,14 +710,14 @@ def make_pseudo_sentence(
offsets = []
for ent, is_sent_split in matches:
if (
- ent.start != last
- and not doclike.doc[last : ent.start].text.strip() == ""
+ ent.start != last
+ and not doclike.doc[last: ent.start].text.strip() == ""
):
pseudo.append("w")
offsets.append(len(pseudo))
pseudo.append(pseudo_mapping.get(ent.label, "." if is_sent_split else "w"))
last = ent.end
- if snippet.end != last and doclike.doc[last : snippet.end].text.strip() == "":
+ if snippet.end != last and doclike.doc[last: snippet.end].text.strip() == "":
pseudo.append("w")
pseudo = "".join(pseudo)
@@ -575,7 +739,7 @@ def get_matches(self, doc):
- List of tuples of spans and whether the spans represents a sentence end
- List of hash label to distinguish unit from other matches
"""
- sent_ends = [doc[i : i + 1] for i in range(len(doc)) if doc[i].is_sent_end]
+ sent_ends = [doc[i: i + 1] for i in range(len(doc)) if doc[i].is_sent_end]
regex_matches = list(self.regex_matcher(doc, as_spans=True))
term_matches = list(self.term_matcher(doc, as_spans=True))
@@ -638,7 +802,7 @@ def extract_measurements(self, doclike: Doc):
# Make match slice function to query them
def get_matches_after(i):
anchor = matches[i][0]
- for j, (ent, is_sent_end) in enumerate(matches[i + 1 :]):
+ for j, (ent, is_sent_end) in enumerate(matches[i + 1:]):
if not is_sent_end and ent.start > anchor.end + AFTER_SNIPPET_LIMIT:
return
yield j + i + 1, ent
@@ -705,7 +869,7 @@ def get_matches_before(i):
# Try to pair the number with this next unit if the two are only separated
# by numbers and separators alternatively (as in [1][,] [2] [and] [3] cm)
try:
- pseudo_sent = pseudo[offsets[number_idx] + 1 : offsets[unit_idx]]
+ pseudo_sent = pseudo[offsets[number_idx] + 1: offsets[unit_idx]]
if not re.fullmatch(r"(,n)*", pseudo_sent):
unit_text, unit_norm = None, None
except TypeError:
@@ -732,8 +896,8 @@ def get_matches_before(i):
)
unit_norm = None
if re.fullmatch(
- r"[,:n]*",
- pseudo[offsets[unitless_idx] + 1 : offsets[number_idx]],
+ r"[,:n]*",
+ pseudo[offsets[unitless_idx] + 1: offsets[number_idx]],
):
unitless_pattern = self.unitless_patterns[unitless_text.label_]
unit_norm = next(
@@ -753,17 +917,17 @@ def get_matches_before(i):
# TODO: handle this part better without .text.strip(), with cases for
# stopwords, etc
if (
- unit_text
- and number.start <= unit_text.end
- and doc[number.end : unit_text.start].text.strip() == ""
+ unit_text
+ and number.start <= unit_text.end
+ and doc[number.end: unit_text.start].text.strip() == ""
):
- ent = doc[number.start : unit_text.end]
+ ent = doc[number.start: unit_text.end]
elif (
- unit_text
- and unit_text.start <= number.end
- and doc[unit_text.end : number.start].text.strip() == ""
+ unit_text
+ and unit_text.start <= number.end
+ and doc[unit_text.end: number.start].text.strip() == ""
):
- ent = doc[unit_text.start : number.end]
+ ent = doc[unit_text.start: number.end]
else:
ent = number
@@ -778,8 +942,11 @@ def get_matches_before(i):
if dims not in self.measure_names:
continue
- ent._.value = SimpleMeasurement(value, unit_norm, self.unit_registry)
ent.label_ = self.measure_names[dims]
+ ent._.set(
+ self.measure_names[dims],
+ SimpleMeasurement(value, unit_norm, self.unit_registry)
+ )
measurements.append(ent)
@@ -791,10 +958,10 @@ def get_matches_before(i):
unmatched = []
for idx, (match, _) in enumerate(matches):
if (
- match.label in unit_label_hashes
- and idx not in matched_unit_indices
- or match.label in self.number_label_hashes
- and idx not in matched_number_indices
+ match.label in unit_label_hashes
+ and idx not in matched_unit_indices
+ or match.label in self.number_label_hashes
+ and idx not in matched_number_indices
):
unmatched.append(match)
@@ -823,9 +990,9 @@ def merge_adjacent_measurements(cls, measurements: List[Span]) -> List[Span]:
if last.end == ent.start and last._.value.unit != ent._.value.unit:
try:
new_value = last._.value + ent._.value
- merged[-1] = last = last.doc[last.start : ent.end]
- last._.value = new_value
+ merged[-1] = last = last.doc[last.start: ent.end]
last.label_ = ent.label_
+ last._.set(last.label_, new_value)
except (AttributeError, TypeError):
merged.append(ent)
else:
@@ -856,7 +1023,7 @@ def merge_measurements_in_ranges(self, measurements: List[Span]) -> List[Span]:
last = merged[-1]
from_text = last.doc[last.start - 1].norm_ if last.start > 0 else None
- to_text = get_text(last.doc[last.end : ent.start], "NORM", True)
+ to_text = get_text(last.doc[last.end: ent.start], "NORM", True)
matching_patterns = [
(a, b)
for a, b in self.range_patterns
@@ -868,12 +1035,12 @@ def merge_measurements_in_ranges(self, measurements: List[Span]) -> List[Span]:
last._.value, ent._.value
)
merged[-1] = last = last.doc[
- last.start
- if matching_patterns[0][0] is None
- else last.start - 1 : ent.end
- ]
+ last.start
+ if matching_patterns[0][0] is None
+ else last.start - 1: ent.end
+ ]
last.label_ = ent.label_
- last._.value = new_value
+ last._.set(last.label_, new_value)
except (AttributeError, TypeError):
merged.append(ent)
else:
@@ -882,9 +1049,9 @@ def merge_measurements_in_ranges(self, measurements: List[Span]) -> List[Span]:
return merged
def merge_with_existing(
- self,
- extracted: List[Span],
- existing: List[Span],
+ self,
+ extracted: List[Span],
+ existing: List[Span],
) -> List[Span]:
"""
Merges the extracted measurements with the existing measurements in the
@@ -907,7 +1074,7 @@ def merge_with_existing(
extracted = []
for span, span_measurements in zip(existing, spans_measurements):
if len(span_measurements):
- span._.value = span_measurements[0]._.value
+ span._.set(span.label_, span_measurements[0]._.get(span.label_))
extracted.append(span)
elif self.merge_mode == "intersect":
@@ -917,9 +1084,6 @@ def merge_with_existing(
extracted.extend(span_measurements)
extracted = list(dict.fromkeys(extracted))
- else:
- extracted = [*extracted, *existing]
-
return extracted
def __call__(self, doc):
@@ -936,33 +1100,21 @@ def __call__(self, doc):
doc:
spaCy Doc object, annotated for extracted measurements.
"""
- ent_labels = set(self.measure_names.values())
- existing = [
- ent
- for ent in (*doc.ents, *doc.spans.get(self.name, ()))
- if ent.label_ in ent_labels
- ]
- other_ents = [ent for ent in doc.ents if ent.label_ not in ent_labels]
+        existing = (
+            list(get_spans(doc, self.span_getter))
+            if self.span_getter is not None
+            else ()
+        )
snippets = (
dict.fromkeys(ent.sent for ent in existing)
- if self.merge_mode in ("intersect", "align")
+ if self.span_getter is not None
else [doc]
)
measurements = [m for s in snippets for m in self.extract_measurements(s)[0]]
measurements = self.merge_adjacent_measurements(measurements)
measurements = self.merge_measurements_in_ranges(measurements)
- measurements = self.merge_with_existing(measurements, existing)
-
- if self.as_ents:
- doc.ents = filter_spans((*other_ents, *measurements))
-
- doc.spans[self.name] = dedup(
- (*measurements, *doc.spans.get(self.name, ())),
- key=lambda x: (x.start, x.end, x.label_),
- )
+ if self.span_getter is not None:
+ measurements = self.merge_with_existing(measurements, existing)
- # for backward compatibility
- if self.name == "measurements":
- doc.spans["measures"] = doc.spans["measurements"]
+ self.set_spans(doc, measurements)
return doc
diff --git a/edsnlp/pipelines/misc/measurements/patterns.py b/edsnlp/pipelines/misc/measurements/patterns.py
index daa69df8f..0d565a156 100644
--- a/edsnlp/pipelines/misc/measurements/patterns.py
+++ b/edsnlp/pipelines/misc/measurements/patterns.py
@@ -549,7 +549,7 @@
common_measurements = {
- "eds.weight": {
+ "weight": {
"unit": "kg",
"unitless_patterns": [
{
@@ -561,7 +561,7 @@
}
],
},
- "eds.size": {
+ "size": {
"unit": "m",
"unitless_patterns": [
{
@@ -583,13 +583,13 @@
}
],
},
- "eds.bmi": {
+ "bmi": {
"unit": "kg_per_m2",
"unitless_patterns": [
{"terms": ["imc", "bmi"], "ranges": [{"unit": "kg_per_m2"}]}
],
},
- "eds.volume": {"unit": "m3", "unitless_patterns": []},
+ "volume": {"unit": "m3", "unitless_patterns": []},
}
unit_divisors = ["/", "par"]
diff --git a/edsnlp/pipelines/misc/reason/__init__.py b/edsnlp/pipelines/misc/reason/__init__.py
index 7e9f5c1d9..628417b82 100644
--- a/edsnlp/pipelines/misc/reason/__init__.py
+++ b/edsnlp/pipelines/misc/reason/__init__.py
@@ -1,2 +1,2 @@
from .patterns import reasons
-from .reason import Reason
+from .reason import ReasonMatcher
diff --git a/edsnlp/pipelines/misc/reason/factory.py b/edsnlp/pipelines/misc/reason/factory.py
index 1b4c8794d..0d05aef71 100644
--- a/edsnlp/pipelines/misc/reason/factory.py
+++ b/edsnlp/pipelines/misc/reason/factory.py
@@ -1,10 +1,9 @@
-from typing import Dict, List, Optional, Union
-
from spacy.language import Language
-from edsnlp.pipelines.misc.reason import Reason
from edsnlp.utils.deprecation import deprecated_factory
+from .reason import ReasonMatcher
+
DEFAULT_CONFIG = dict(
reasons=None,
attr="TEXT",
@@ -12,21 +11,12 @@
ignore_excluded=False,
)
-
-@deprecated_factory("reason", "eds.reason", default_config=DEFAULT_CONFIG)
-@Language.factory("eds.reason", default_config=DEFAULT_CONFIG)
-def create_component(
- nlp: Language,
- name: str,
- reasons: Optional[Dict[str, Union[List[str], str]]],
- attr: str,
- use_sections: bool,
- ignore_excluded: bool,
-):
- return Reason(
- nlp,
- reasons=reasons,
- attr=attr,
- use_sections=use_sections,
- ignore_excluded=ignore_excluded,
- )
+create_component = deprecated_factory(
+ "reason",
+ "eds.reason",
+ assigns=["doc.spans", "doc.ents"],
+)(ReasonMatcher)
+create_component = Language.factory(
+ "eds.reason",
+ assigns=["doc.spans", "doc.ents"],
+)(create_component)
diff --git a/edsnlp/pipelines/misc/reason/reason.py b/edsnlp/pipelines/misc/reason/reason.py
index 5aaaa0a38..9bc840036 100644
--- a/edsnlp/pipelines/misc/reason/reason.py
+++ b/edsnlp/pipelines/misc/reason/reason.py
@@ -4,45 +4,115 @@
from spacy.language import Language
from spacy.tokens import Doc, Span
-from edsnlp.pipelines.core.matcher import GenericMatcher
+from edsnlp.pipelines.core.matcher.matcher import GenericMatcher
from edsnlp.pipelines.misc.reason import patterns
from edsnlp.utils.filter import get_spans
from edsnlp.utils.inclusion import check_inclusion
-class Reason(GenericMatcher):
- """Pipeline to identify the reason of the hospitalisation.
+class ReasonMatcher(GenericMatcher):
+ '''
+    The `eds.reason` matcher uses a rule-based algorithm to detect spans that
+    relate to the reason for hospitalisation. It was designed at AP-HP's EDS.
+
+ Examples
+ --------
+ The following snippet matches a simple terminology, and looks for spans of
+ hospitalisation reasons. It is complete and can be run _as is_.
+
+ ```python
+ import spacy
+
+ text = """COMPTE RENDU D'HOSPITALISATION du 11/07/2018 au 12/07/2018
+ MOTIF D'HOSPITALISATION
+ Monsieur Dupont Jean Michel, de sexe masculin, âgée de 39 ans, née le 23/11/1978,
+ a été hospitalisé du 11/08/2019 au 17/08/2019 pour attaque d'asthme.
+
+ ANTÉCÉDENTS
+ Antécédents médicaux :
+ Premier épisode d'asthme en mai 2018."""
+
+ nlp = spacy.blank("eds")
+
+ # Extraction of entities
+ nlp.add_pipe(
+ "eds.matcher",
+ config=dict(
+ terms=dict(
+ respiratoire=[
+ "asthmatique",
+ "asthme",
+ "toux",
+ ]
+ )
+ ),
+ )
+
+
+ nlp.add_pipe("eds.normalizer")
+ nlp.add_pipe("eds.reason", config=dict(use_sections=True))
+ doc = nlp(text)
+
+ reason = doc.spans["reasons"][0]
+ reason
+ # Out: hospitalisé du 11/08/2019 au 17/08/2019 pour attaque d'asthme.
+
+ reason._.is_reason
+ # Out: True
+
+ entities = reason._.ents_reason
+ entities
+ # Out: [asthme]
- It declares a Span extension called `ents_reason` and adds
- the key `reasons` to doc.spans.
+ entities[0].label_
+ # Out: 'respiratoire'
- It also declares the boolean extension `is_reason`.
- This extension is set to True for the Reason Spans but also
- for the entities that overlap the reason span.
+ ent = entities[0]
+ ent._.is_reason
+ # Out: True
+ ```
+
+ Extensions
+ ----------
+ The `eds.reason` pipeline adds the key `reasons` to `doc.spans` and declares
+    one extension on the `Span` objects, called `ents_reason`.
+
+ The `ents_reason` extension is a list of named entities that overlap the `Span`,
+ typically entities found in upstream components like `matcher`.
+
+ It also declares the boolean extension `is_reason`. This extension is set to True
+ for the Reason Spans but also for the entities that overlap the reason span.
Parameters
----------
nlp : Language
- spaCy nlp pipeline to use for matching.
- reasons : Optional[Dict[str, Union[List[str], str]]]
- The terminology of reasons.
+ The pipeline object
+ name : str
+ Name of the component
+ reasons : Dict[str, Union[List[str], str]]
+ Reason patterns
attr : str
- spaCy's attribute to use:
- a string with the value "TEXT" or "NORM", or a dict with
- the key 'term_attr'. We can also add a key for each regex.
+ Default token attribute to use to build the text to match on.
use_sections : bool,
- whether or not use the `sections` pipeline to improve results.
+        Whether or not to use the `sections` matcher to improve results.
ignore_excluded : bool
Whether to skip excluded tokens.
- """
+
+ Authors and citation
+ --------------------
+ The `eds.reason` matcher was developed by AP-HP's Data Science team.
+
+ '''
def __init__(
self,
nlp: Language,
- reasons: Optional[Dict[str, Union[List[str], str]]],
- attr: Union[Dict[str, str], str],
- use_sections: bool,
- ignore_excluded: bool,
+ name: str = "eds.reason",
+ *,
+ reasons: Optional[Dict[str, Union[List[str], str]]] = None,
+ attr: Union[Dict[str, str], str] = "TEXT",
+ use_sections: bool = False,
+ ignore_excluded: bool = False,
):
if reasons is None:
@@ -50,10 +120,12 @@ def __init__(
super().__init__(
nlp,
+ name=name,
terms=None,
regex=reasons,
attr=attr,
ignore_excluded=ignore_excluded,
+ span_setter={},
)
self.use_sections = use_sections and (
@@ -74,6 +146,7 @@ def set_extensions(cls) -> None:
if not Span.has_extension("ents_reason"):
Span.set_extension("ents_reason", default=None)
+ # TODO: remove this extension, and filter directly in span group
if not Span.has_extension("is_reason"):
Span.set_extension("is_reason", default=False)
diff --git a/edsnlp/pipelines/misc/sections/__init__.py b/edsnlp/pipelines/misc/sections/__init__.py
index a7050814a..5c4947222 100644
--- a/edsnlp/pipelines/misc/sections/__init__.py
+++ b/edsnlp/pipelines/misc/sections/__init__.py
@@ -1 +1,3 @@
-from .sections import Sections
+from .sections import SectionsMatcher
+
+Sections = SectionsMatcher
diff --git a/edsnlp/pipelines/misc/sections/factory.py b/edsnlp/pipelines/misc/sections/factory.py
index 2c61cc716..f2ad91b79 100644
--- a/edsnlp/pipelines/misc/sections/factory.py
+++ b/edsnlp/pipelines/misc/sections/factory.py
@@ -1,33 +1,23 @@
-from typing import Dict, List, Optional
-
from spacy.language import Language
from edsnlp.utils.deprecation import deprecated_factory
-from . import Sections
+from .patterns import sections
+from .sections import SectionsMatcher
DEFAULT_CONFIG = dict(
- sections=None,
+ sections=sections,
add_patterns=True,
attr="NORM",
ignore_excluded=True,
)
-
-@deprecated_factory("sections", "eds.sections", default_config=DEFAULT_CONFIG)
-@Language.factory("eds.sections", default_config=DEFAULT_CONFIG)
-def create_component(
- nlp: Language,
- name: str,
- sections: Optional[Dict[str, List[str]]],
- add_patterns: bool,
- attr: str,
- ignore_excluded: bool,
-):
- return Sections(
- nlp,
- sections=sections,
- add_patterns=add_patterns,
- attr=attr,
- ignore_excluded=ignore_excluded,
- )
+create_component = deprecated_factory(
+ "sections",
+ "eds.sections",
+ assigns=["doc.spans", "doc.ents"],
+)(SectionsMatcher)
+create_component = Language.factory(
+ "eds.sections",
+ assigns=["doc.spans", "doc.ents"],
+)(create_component)
diff --git a/edsnlp/pipelines/misc/sections/sections.py b/edsnlp/pipelines/misc/sections/sections.py
index 4eb4ceae6..2f59eedf9 100644
--- a/edsnlp/pipelines/misc/sections/sections.py
+++ b/edsnlp/pipelines/misc/sections/sections.py
@@ -5,69 +5,120 @@
from spacy.tokens import Doc, Span
from spacy.util import filter_spans
-from edsnlp.pipelines.core.matcher import GenericMatcher
+from edsnlp.pipelines.core.matcher.matcher import GenericMatcher
from . import patterns
-class Sections(GenericMatcher):
+class SectionsMatcher(GenericMatcher):
+ '''
+ The `eds.sections` component extracts section titles from clinical documents.
+ A "section" is then defined as the span of text between two titles.
+
+    Here is the list of sections that are currently targeted:
+
+ - `allergies`
+ - `antécédents`
+ - `antécédents familiaux`
+ - `traitements entrée`
+ - `conclusion`
+ - `conclusion entrée`
+ - `habitus`
+ - `correspondants`
+ - `diagnostic`
+ - `données biométriques entrée`
+ - `examens`
+ - `examens complémentaires`
+ - `facteurs de risques`
+ - `histoire de la maladie`
+ - `actes`
+ - `motif`
+ - `prescriptions`
+ - `traitements sortie`
+ - `evolution`
+ - `modalites sortie`
+ - `vaccinations`
+ - `introduction`
+
+    Remarks:
+
+ - section `introduction` corresponds to the span of text between the header
+ "COMPTE RENDU D'HOSPITALISATION" (usually denoting the beginning of the document)
+ and the title of the following detected section
+    - this matcher works well for hospitalization summaries (CRH), but not
+      necessarily for all types of documents (in particular for emergency or
+      imaging reports, CR-IMAGERIE)
+
+ !!! warning "Experimental"
+
+        Should you rely on `eds.sections` for critical downstream tasks, make
+        sure to validate the results and check that the component works in
+        your case.
+
+ Examples
+ --------
+ The following snippet detects section titles. It is complete and can be run _as is_.
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.normalizer")
+ nlp.add_pipe("eds.sections")
+
+ text = """
+ CRU du 10/09/2021
+ Motif :
+ Patient admis pour suspicion de COVID
"""
- Divides the document into sections.
-
- By default, we are using a dataset of documents annotated for section titles,
- using the work done by Ivan Lerner, reviewed by Gilles Chatellier.
-
- Detected sections are :
-
- - allergies ;
- - antécédents ;
- - antécédents familiaux ;
- - traitements entrée ;
- - conclusion ;
- - conclusion entrée ;
- - habitus ;
- - correspondants ;
- - diagnostic ;
- - données biométriques entrée ;
- - examens ;
- - examens complémentaires ;
- - facteurs de risques ;
- - histoire de la maladie ;
- - actes ;
- - motif ;
- - prescriptions ;
- - traitements sortie.
-
- The component looks for section titles within the document,
- and stores them in the `section_title` extension.
-
- For ease-of-use, the component also populates a `section` extension,
- which contains a list of spans corresponding to the "sections" of the
- document. These span from the start of one section title to the next,
- which can introduce obvious bias should an intermediate section title
- goes undetected.
+
+ doc = nlp(text)
+
+ doc.spans["section_titles"]
+ # Out: [Motif]
+ ```
+
+ Extensions
+ ----------
+    The `eds.sections` matcher adds two fields to the `doc.spans` attribute:
+
+ 1. The `section_titles` key contains the list of all section titles extracted using
+ the list declared in the `terms.py` module.
+    2. The `sections` key contains a list of sections, i.e. spans of text
+       between two section titles (or the last title and the end of the
+       document).
+
+    If the document has entities before calling this matcher, a `section`
+    attribute is added to each entity.
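+
+    For instance, building on the snippet above (a sketch, output not
+    validated):
+
+    ```{ .python .no-check }
+    doc.spans["sections"][0].label_
+    # Out: 'motif'
+    ```
+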
Parameters
----------
nlp : Language
- spaCy pipeline object.
+ The pipeline object.
sections : Dict[str, List[str]]
Dictionary of terms to look for.
attr : str
Default attribute to match on.
+ add_patterns : bool
+        Whether to add patterns matching the start and end of lines
ignore_excluded : bool
Whether to skip excluded tokens.
- """
+
+ Authors and citation
+ --------------------
+ The `eds.sections` matcher was developed by AP-HP's Data Science team.
+ '''
def __init__(
self,
nlp: Language,
- sections: Dict[str, List[str]],
- add_patterns: bool,
- attr: str,
- ignore_excluded: bool,
+ name: str = "eds.sections",
+ *,
+ sections: Dict[str, List[str]] = patterns.sections,
+ add_patterns: bool = True,
+ attr: str = "NORM",
+ ignore_excluded: bool = True,
):
-
logger.warning(
"The component Sections is still in Beta. Use at your own risks."
)
@@ -80,17 +131,18 @@ def __init__(
if add_patterns:
for k, v in sections.items():
-
sections[k] = [
r"(?<=(?:\n|^)[^\n]{0,5})" + ent + r"(?=[^\n]{0,5}\n)" for ent in v
]
super().__init__(
nlp,
+ name=name,
terms=None,
regex=sections,
attr=attr,
ignore_excluded=ignore_excluded,
+ span_setter={},
)
self.set_extensions()
diff --git a/edsnlp/pipelines/misc/tables/factory.py b/edsnlp/pipelines/misc/tables/factory.py
index 047c2a55f..86b316366 100644
--- a/edsnlp/pipelines/misc/tables/factory.py
+++ b/edsnlp/pipelines/misc/tables/factory.py
@@ -1,5 +1,3 @@
-from typing import Dict, List, Optional, Union
-
from spacy.language import Language
from edsnlp.pipelines.misc.tables import TablesMatcher
@@ -12,21 +10,13 @@
ignore_excluded=True,
)
-
-@deprecated_factory("tables", "eds.tables", default_config=DEFAULT_CONFIG)
-@Language.factory("eds.tables", default_config=DEFAULT_CONFIG)
-def create_component(
- nlp: Language,
- name: str,
- tables_pattern: Optional[Dict[str, Union[List[str], str]]],
- sep_pattern: Optional[str],
- attr: str,
- ignore_excluded: bool,
-):
- return TablesMatcher(
- nlp,
- tables_pattern=tables_pattern,
- sep_pattern=sep_pattern,
- attr=attr,
- ignore_excluded=ignore_excluded,
- )
+create_component = TablesMatcher
+create_component = deprecated_factory(
+ "tables",
+ "eds.tables",
+ assigns=["doc.spans", "doc.ents"],
+)(create_component)
+create_component = Language.factory(
+ "eds.tables",
+ assigns=["doc.spans", "doc.ents"],
+)(create_component)
diff --git a/edsnlp/pipelines/misc/tables/tables.py b/edsnlp/pipelines/misc/tables/tables.py
index 106f63216..fc2f6905c 100644
--- a/edsnlp/pipelines/misc/tables/tables.py
+++ b/edsnlp/pipelines/misc/tables/tables.py
@@ -5,20 +5,100 @@
from spacy.language import Language
from spacy.tokens import Doc, Span
-from edsnlp.pipelines.core.matcher import GenericMatcher
+from edsnlp.pipelines.core.matcher.matcher import GenericMatcher
from edsnlp.pipelines.misc.tables import patterns
from edsnlp.utils.filter import get_spans
class TablesMatcher(GenericMatcher):
- """Pipeline to identify the Tables.
+ '''
+    The `eds.tables` matcher detects tables in a document.
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.normalizer")
+ nlp.add_pipe("eds.tables")
+
+ text = """
+ SERVICE
+ MEDECINE INTENSIVE –
+ REANIMATION
+ Réanimation / Surveillance Continue
+ Médicale
+
+ COMPTE RENDU D'HOSPITALISATION du 05/06/2020 au 10/06/2020
+ Madame DUPONT Marie, née le 16/05/1900, âgée de 20 ans, a été hospitalisée en
+ réanimation du 05/06/1920 au 10/06/1920 pour intoxication médicamenteuse volontaire.
+
+ Examens complémentaires
+ Hématologie
+ Numération
+ Leucocytes ¦x10*9/L ¦4.97 ¦4.09-11
+ Hématies ¦x10*12/L¦4.68 ¦4.53-5.79
+ Hémoglobine ¦g/dL ¦14.8 ¦13.4-16.7
+ Hématocrite ¦% ¦44.2 ¦39.2-48.6
+ VGM ¦fL ¦94.4 + ¦79.6-94
+ TCMH ¦pg ¦31.6 ¦27.3-32.8
+ CCMH ¦g/dL ¦33.5 ¦32.4-36.3
+ Plaquettes ¦x10*9/L ¦191 ¦172-398
+ VMP ¦fL ¦11.5 + ¦7.4-10.8
+
+ Sur le plan neurologique : Devant la persistance d'une confusion à distance de
+ l'intoxication au
+ ...
+
+ 2/2Pat : |F | | |Intitulé RCP
+ """
- It adds the key `tables` to doc.spans.
+ doc = nlp(text)
+
+ # A table span
+ table = doc.spans["tables"][0]
+
+ # Leucocytes ¦x10*9/L ¦4.97 ¦4.09-11
+ # Hématies ¦x10*12/L¦4.68 ¦4.53-5.79
+ # Hémoglobine ¦g/dL ¦14.8 ¦13.4-16.7
+ # Hématocrite ¦% ¦44.2 ¦39.2-48.6
+ # VGM ¦fL ¦94.4 + ¦79.6-94
+ # TCMH ¦pg ¦31.6 ¦27.3-32.8
+ # CCMH ¦g/dL ¦33.5 ¦32.4-36.3
+ # Plaquettes ¦x10*9/L ¦191 ¦172-398
+ # VMP ¦fL ¦11.5 + ¦7.4-10.8
+
+ # Convert span to Pandas table
+ df = table._.to_pd_table()
+ type(df)
+ # Out: pandas.core.frame.DataFrame
+ ```
+ The pandas DataFrame:
+
+ | | 0 | 1 | 2 | 3 |
+ | ---: | :---------- | :------- | :----- | :-------- |
+ | 0 | Leucocytes | x10*9/L | 4.97 | 4.09-11 |
+ | 1 | Hématies | x10*12/L | 4.68 | 4.53-5.79 |
+ | 2 | Hémoglobine | g/dL | 14.8 | 13.4-16.7 |
+ | 3 | Hématocrite | % | 44.2 | 39.2-48.6 |
+ | 4 | VGM | fL | 94.4 + | 79.6-94 |
+ | 5 | TCMH | pg | 31.6 | 27.3-32.8 |
+ | 6 | CCMH | g/dL | 33.5 | 32.4-36.3 |
+ | 7 | Plaquettes | x10*9/L | 191 | 172-398 |
+ | 8 | VMP | fL | 11.5 + | 7.4-10.8 |
+
+ Extensions
+ ----------
+ The `eds.tables` pipeline declares the `span._.to_pd_table()` Span extension.
+ This method returns the table parsed into a pandas DataFrame.
Parameters
----------
nlp : Language
spaCy nlp pipeline to use for matching.
+ name: str
+ Name of the component.
tables_pattern : Optional[Dict[str, str]]
The regex pattern to identify tables.
The key of dictionary should be `tables`
@@ -28,17 +108,22 @@ class TablesMatcher(GenericMatcher):
the key 'term_attr'. We can also add a key for each regex.
ignore_excluded : bool
Whether to skip excluded tokens.
- """
+
+ Authors and citation
+ --------------------
+ The `eds.tables` pipeline was developed by AP-HP's Data Science team.
+ '''
def __init__(
self,
nlp: Language,
- tables_pattern: Optional[Dict[str, str]],
- sep_pattern: Optional[str],
- attr: Union[Dict[str, str], str],
- ignore_excluded: bool,
+ name: str = "eds.tables",
+ *,
+ tables_pattern: Optional[Dict[str, str]] = None,
+ sep_pattern: Optional[str] = None,
+ attr: Union[Dict[str, str], str] = "TEXT",
+ ignore_excluded: bool = True,
):
-
if tables_pattern is None:
self.tables_pattern = patterns.regex
else:
@@ -50,7 +135,8 @@ def __init__(
self.sep = sep_pattern
super().__init__(
- nlp,
+ nlp=nlp,
+ name=name,
terms=None,
regex=self.tables_pattern,
attr=attr,
diff --git a/edsnlp/pipelines/ner/adicap/adicap.py b/edsnlp/pipelines/ner/adicap/adicap.py
index f1426f247..133322b44 100644
--- a/edsnlp/pipelines/ner/adicap/adicap.py
+++ b/edsnlp/pipelines/ner/adicap/adicap.py
@@ -1,26 +1,152 @@
"""`eds.adicap` pipeline"""
import re
+from typing import List, Optional, Union
+from spacy import Language
from spacy.tokens import Doc, Span
+from edsnlp.pipelines.base import SpanSetterArg
from edsnlp.pipelines.core.contextual_matcher import ContextualMatcher
-from edsnlp.utils.filter import filter_spans
from edsnlp.utils.resources import get_adicap_dict
from . import patterns
from .models import AdicapCode
-class Adicap(ContextualMatcher):
- def __init__(self, nlp, pattern, attr, prefix, window):
+# noinspection SpellCheckingInspection
+class AdicapMatcher(ContextualMatcher):
+ """
+ The `eds.adicap` pipeline component matches ADICAP codes. It was developed to
+ run on anatomical pathology reports.
- self.nlp = nlp
- if pattern is None:
- pattern = patterns.base_code
-
- if prefix is None:
- prefix = patterns.adicap_prefix
+ !!! warning "Document type"
+ It was developed to work on anatomical pathology reports.
+ We also recommend using the `eds` language (`spacy.blank("eds")`).
+
+ The compulsory characters of the ADICAP code are identified and decoded.
+ These characters represent the following attributes:
+
+ | Field [en] | Field [fr] | Attribute |
+ |-------------------|-------------------------------|-----------------|
+ | Sampling mode | Mode de prelevement | sampling_mode |
+ | Technic | Type de technique | technic |
+ | Organ and regions | Appareils, organes et régions | organ |
+ | Pathology | Pathologie générale | pathology |
+ | Pathology type | Type de la pathologie | pathology_type |
+ | Behaviour type | Type de comportement | behaviour_type |
+
+
+ The pathology field takes 4 different values corresponding to the 4 possible
+ interpretations of the ADICAP code, which are: "PATHOLOGIE GÉNÉRALE NON TUMORALE",
+ "PATHOLOGIE TUMORALE", "PATHOLOGIE PARTICULIERE DES ORGANES" and "CYTOPATHOLOGIE".
+
+ The meaning of the behaviour type depends on the pathology value: when the
+ pathology is tumoral, it describes the malignancy of the tumor.
+
+ For further details about the ADICAP code follow this [link](https://smt.esante.\
+gouv.fr/wp-json/ans/terminologies/document?terminologyId=terminologie-adicap&file\
+Name=cgts_sem_adicap_fiche-detaillee.pdf).
+
+ Examples
+ --------
+ ```{ .python .no-check }
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe("eds.adicap")
+
+ text = \"\"\"
+ COMPTE RENDU D’EXAMEN
+
+ Antériorité(s) : NEANT
+
+
+ Renseignements cliniques :
+ Contexte d'exploration d'un carcinome canalaire infiltrant du quadrant supéro-
+ externe du sein droit. La lésion biopsiée ce jour est située à 5,5 cm de la lésion
+ du quadrant supéro-externe, à l'union des quadrants inférieurs.
+
+
+ Macrobiopsie 10G sur une zone de prise de contraste focale à l'union des quadrants
+ inférieurs du sein droit, mesurant 4 mm, classée ACR4
+
+ 14 fragments ont été communiqués fixés en formol (lame n° 1a et lame n° 1b) . Il
+ n'y a pas eu d'échantillon congelé. Ces fragments ont été inclus en paraffine en
+ totalité et coupés sur plusieurs niveaux.
+ Histologiquement, il s'agit d'un parenchyme mammaire fibroadipeux parfois
+ légèrement dystrophique avec quelques petits kystes. Il n'y a pas d'hyperplasie
+ épithéliale, pas d'atypie, pas de prolifération tumorale. On note quelques
+ suffusions hémorragiques focales.
+
+ Conclusion :
+ Légers remaniements dystrophiques à l'union des quadrants inférieurs du sein droit.
+ Absence d'atypies ou de prolifération tumorale.
+
+ Codification : BHGS0040
+ \"\"\"
+
+ doc = nlp(text)
+
+ doc.ents
+ # Out: (BHGS0040,)
+
+ ent = doc.ents[0]
+
+ ent.label_
+ # Out: adicap
+
+ ent._.adicap.dict()
+ # Out: {'code': 'BHGS0040',
+ # 'sampling_mode': 'BIOPSIE CHIRURGICALE',
+ # 'technic': 'HISTOLOGIE ET CYTOLOGIE PAR INCLUSION',
+ # 'organ': "SEIN (ÉGALEMENT UTILISÉ CHEZ L'HOMME)",
+ # 'pathology': 'PATHOLOGIE GÉNÉRALE NON TUMORALE',
+ # 'pathology_type': 'ETAT SUBNORMAL - LESION MINEURE',
+ # 'behaviour_type': 'CARACTERES GENERAUX'}
+ ```
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline object
+ name : str
+ The name of the pipe
+ pattern : Optional[Union[List[str], str]]
+ The regex pattern to use for matching ADICAP codes
+ prefix : Optional[Union[List[str], str]]
+ The regex pattern to use for matching the prefix before ADICAP codes
+ window : int
+ Number of tokens in which to look for the prefix. The window never extends
+ beyond the start of the sentence
+ attr : str
+ Attribute to match on, eg `TEXT`, `NORM`, etc.
+ label : str
+ Label name to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.adicap` pipeline was developed by AP-HP's Data Science team.
+ The codes were downloaded from the website of 'Agence du numérique en santé'
+ ("Thésaurus de la codification ADICAP - Index raisonné des
+ lésions", [@terminologie-adicap])
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language],
+ name: str = "eds.adicap",
+ *,
+ pattern: Union[List[str], str] = patterns.base_code,
+ prefix: Union[List[str], str] = patterns.adicap_prefix,
+ window: int = 500,
+ attr: str = "TEXT",
+ label: str = "adicap",
+ span_setter: SpanSetterArg = {"ents": True, "adicap": True},
+ ):
adicap_pattern = dict(
source="adicap",
regex=prefix,
@@ -38,7 +164,8 @@ def __init__(self, nlp, pattern, attr, prefix, window):
super().__init__(
nlp=nlp,
- name="adicap",
+ name=name,
+ label=label,
attr=attr,
patterns=adicap_pattern,
ignore_excluded=False,
@@ -46,19 +173,17 @@ def __init__(self, nlp, pattern, attr, prefix, window):
alignment_mode="expand",
include_assigned=False,
assign_as_span=False,
+ span_setter=span_setter,
)
self.decode_dict = get_adicap_dict()
self.set_extensions()
- @classmethod
- def set_extensions(cls) -> None:
+ def set_extensions(self) -> None:
super().set_extensions()
- if not Span.has_extension("adicap"):
- Span.set_extension("adicap", default=None)
- if not Span.has_extension("value"):
- Span.set_extension("value", default=None)
+ if not Span.has_extension(self.label):
+ Span.set_extension(self.label, default=None)
def decode(self, code):
code = re.sub("[^A-Za-z0-9 ]+", "", code)
@@ -86,7 +211,7 @@ def decode(self, code):
return adicap
- def __call__(self, doc: Doc) -> Doc:
+ def process(self, doc: Doc) -> List[Span]:
"""
Tags ADICAP mentions.
@@ -100,22 +225,6 @@ def __call__(self, doc: Doc) -> Doc:
doc : Doc
spaCy Doc object, annotated for ADICAP
"""
- spans = self.process(doc)
- spans = filter_spans(spans)
-
- for span in spans:
- span._.adicap = self.decode(span._.assigned["code"])
- span._.value = span._.adicap
- span._.assigned = None
-
- doc.spans["adicap"] = spans
-
- ents, discarded = filter_spans(list(doc.ents) + spans, return_discarded=True)
-
- doc.ents = ents
-
- if "discarded" not in doc.spans:
- doc.spans["discarded"] = []
- doc.spans["discarded"].extend(discarded)
-
- return doc
+ for span in super().process(doc):
+ span._.set(self.label, self.decode(span._.assigned["code"]))
+ yield span
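
The refactor moves the span post-processing out of `__call__` and into a `process` generator. A rough sketch of the assumed contract, with simplified stand-in classes:

```python
from typing import Iterable

from spacy.tokens import Doc, Span


class BaseMatcherSketch:
    # Stand-in for the shared matcher base class: __call__ consumes
    # process(), then (in the real code) filters overlapping spans and
    # writes them onto the doc through the configured span_setter.
    def __call__(self, doc: Doc) -> Doc:
        spans = list(self.process(doc))
        return doc

    def process(self, doc: Doc) -> Iterable[Span]:
        yield from ()  # the real base class yields regex/term matches


class AdicapLikeMatcher(BaseMatcherSketch):
    label = "adicap"

    def process(self, doc: Doc) -> Iterable[Span]:
        for span in super().process(doc):
            # the real pipe decodes span._.assigned["code"] here
            span._.set(self.label, "decoded code")
            yield span
```
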
diff --git a/edsnlp/pipelines/ner/adicap/factory.py b/edsnlp/pipelines/ner/adicap/factory.py
index 78ade41dd..5e8772235 100644
--- a/edsnlp/pipelines/ner/adicap/factory.py
+++ b/edsnlp/pipelines/ner/adicap/factory.py
@@ -1,49 +1,18 @@
-from typing import List, Optional, Union
-
from spacy.language import Language
-from .adicap import Adicap
+from .adicap import AdicapMatcher
from .patterns import adicap_prefix, base_code
DEFAULT_CONFIG = dict(
pattern=base_code,
prefix=adicap_prefix,
- attr="TEXT",
window=500,
+ attr="TEXT",
+ label="adicap",
+ span_setter={"ents": True, "adicap": True},
)
-
-@Language.factory(
+create_component = Language.factory(
"eds.adicap",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str = "eds.adicap",
- pattern: Optional[Union[List[str], str]] = base_code,
- prefix: Optional[Union[List[str], str]] = adicap_prefix,
- window: int = 500,
- attr: str = "TEXT",
-):
- """
- Create a new component to recognize and normalize ADICAP codes in documents.
-
- Parameters
- ----------
- nlp: Language
- spaCy `Language` object.
- name: str
- The name of the pipe
- pattern: Optional[Union[List[str], str]]
- The regex pattern to use for matching ADICAP codes
- prefix: Optional[Union[List[str], str]]
- The regex pattern to use for matching the prefix before ADICAP codes
- window: int
- Number of tokens to look for prefix. It will never go further the start of
- the sentence
- attr: str
- Attribute to match on, eg `TEXT`, `NORM`, etc.
- """
-
- return Adicap(nlp, pattern=pattern, attr=attr, prefix=prefix, window=window)
+)(AdicapMatcher)
diff --git a/edsnlp/pipelines/ner/behaviors/alcohol/alcohol.py b/edsnlp/pipelines/ner/behaviors/alcohol/alcohol.py
index b2e21bda0..de7785fa4 100644
--- a/edsnlp/pipelines/ner/behaviors/alcohol/alcohol.py
+++ b/edsnlp/pipelines/ner/behaviors/alcohol/alcohol.py
@@ -1,28 +1,110 @@
"""`eds.alcohol` pipeline"""
-from typing import Generator
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
from spacy.tokens import Doc, Span
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
-
+from ...disorders.base import DisorderMatcher
from .patterns import default_patterns
-class Alcohol(DisorderMatcher):
- def __init__(self, nlp, patterns):
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+class AlcoholMatcher(DisorderMatcher):
+ """
+ The `eds.alcohol` pipeline component extracts mentions of alcohol consumption.
+ It won't match occasional consumption, nor acute intoxication.
+
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/behaviors/alcohol/patterns.py"
+ # fmt: on
+ ```
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to either
+ - `"PRESENT"`
+ - `"ABSTINENCE"` if the patient stopped its consumption
+ - `"ABSENT"` if the patient has no alcohol dependence
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe(f"eds.alcohol")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/alcohol-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline object
+ name : Optional[str]
+ The name of the component
+ patterns : Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.alcohol` component was developed by AP-HP's Data Science team with a team
+ of medical experts. A paper describing in detail the development of those
+ components is being drafted and will soon be available.
+ """
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.alcohol",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label="alcohol",
+ span_setter={"ents": True, "alcohol": True},
+ ):
super().__init__(
nlp=nlp,
- name="alcohol",
+ name=name,
patterns=patterns,
- detailled_statusmapping={0: "ABSENT", 1: "PRESENT", 2: "ABSTINENCE"},
+ detailed_status_mapping={
+ 0: "ABSENT",
+ 1: "PRESENT",
+ 2: "ABSTINENCE",
+ },
+ label=label,
+ span_setter=span_setter,
)
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- for span in spans:
+ def process(self, doc: Doc) -> List[Span]:
+ for span in super().process(doc):
if "stopped" in span._.assigned.keys():
span._.status = 2
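
A hypothetical reading of the resulting annotation (the text and output are illustrative, not taken from a real run):

```python
# Assumes the `nlp` pipeline built in the docstring example above.
doc = nlp("Alcoolisme chronique, sevré depuis 2010.")
span = doc.spans["alcohol"][0]

span._.detailed_status
# Out: 'ABSTINENCE', since the "stopped" cue sets status to 2
```
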
diff --git a/edsnlp/pipelines/ner/behaviors/alcohol/factory.py b/edsnlp/pipelines/ner/behaviors/alcohol/factory.py
index 304017ea1..f0384b50c 100644
--- a/edsnlp/pipelines/ner/behaviors/alcohol/factory.py
+++ b/edsnlp/pipelines/ner/behaviors/alcohol/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .alcohol import Alcohol
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .alcohol import AlcoholMatcher
+from .patterns import default_patterns
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="alcohol",
+ span_setter={"ents": True, "alcohol": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.alcohol",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return Alcohol(nlp, patterns=patterns)
+)(AlcoholMatcher)
diff --git a/edsnlp/pipelines/ner/behaviors/tobacco/factory.py b/edsnlp/pipelines/ner/behaviors/tobacco/factory.py
index ca7b13436..e704ea0de 100644
--- a/edsnlp/pipelines/ner/behaviors/tobacco/factory.py
+++ b/edsnlp/pipelines/ner/behaviors/tobacco/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .tobacco import Tobacco
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .patterns import default_patterns
+from .tobacco import TobaccoMatcher
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="tobacco",
+ span_setter={"ents": True, "tobacco": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.tobacco",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return Tobacco(nlp, patterns=patterns)
+)(TobaccoMatcher)
diff --git a/edsnlp/pipelines/ner/behaviors/tobacco/patterns.py b/edsnlp/pipelines/ner/behaviors/tobacco/patterns.py
index 35372f143..681382423 100644
--- a/edsnlp/pipelines/ner/behaviors/tobacco/patterns.py
+++ b/edsnlp/pipelines/ner/behaviors/tobacco/patterns.py
@@ -2,49 +2,51 @@
QUANTITY = r"(?P[\d]{1,3})"
PUNCT = r"\.,-;\(\)"
-default_patterns = dict(
- source="tobacco",
- regex=[
- r"tabagi",
- r"tabac",
- r"\bfume\b",
- r"\bfumeu",
- r"\bpipes?\b",
- ],
- exclude=dict(
+default_patterns = [
+ dict(
+ source="tobacco",
regex=[
- "occasion",
- "moder",
- "quelqu",
- "festi",
- "rare",
- "sujet", # Example : Chez le sujet fumeur ... generic sentences
+ r"tabagi",
+ r"tabac",
+ r"\bfume\b",
+ r"\bfumeu",
+ r"\bpipes?\b",
],
- window=(-3, 5),
- ),
- regex_attr="NORM",
- assign=[
- dict(
- name="stopped",
- regex=r"(? List[Span]:
+ for span in super().process(doc):
if "stopped" in span._.assigned.keys():
span._.status = 2
diff --git a/edsnlp/pipelines/ner/cim10/factory.py b/edsnlp/pipelines/ner/cim10/factory.py
index 985ddef0c..01b3efb74 100644
--- a/edsnlp/pipelines/ner/cim10/factory.py
+++ b/edsnlp/pipelines/ner/cim10/factory.py
@@ -1,67 +1,115 @@
-from typing import Any, Dict, Union
+from typing import Any, Dict
from spacy.language import Language
+from typing_extensions import Literal
-from edsnlp.pipelines.core.terminology import TerminologyMatcher, TerminologyTermMatcher
+from edsnlp.pipelines.core.terminology.terminology import TerminologyMatcher
-from . import patterns
+from ...base import SpanSetterArg
+from .patterns import get_patterns
DEFAULT_CONFIG = dict(
attr="NORM",
ignore_excluded=False,
ignore_space_tokens=False,
- term_matcher=TerminologyTermMatcher.exact,
+ term_matcher="exact",
term_matcher_config={},
+ label="cim10",
+ span_setter={"ents": True, "cim10": True},
)
@Language.factory(
- "eds.cim10", default_config=DEFAULT_CONFIG, assigns=["doc.ents", "doc.spans"]
+ "eds.cim10",
+ assigns=["doc.ents", "doc.spans"],
)
def create_component(
nlp: Language,
name: str = "eds.cim10",
- attr: Union[str, Dict[str, str]] = "NORM",
+ *,
+ attr: str = "NORM",
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
- term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
+ term_matcher: Literal["exact", "simstring"] = "exact",
term_matcher_config: Dict[str, Any] = {},
+ label: str = "cim10",
+ span_setter: SpanSetterArg = {"ents": True, "cim10": True},
):
"""
- Create a factory that returns new a component to recognize and normalize CIM10 codes
- and concepts in documents.
+ The `eds.cim10` pipeline component extract terms from documents using the CIM10
+ (French-language ICD) terminology as a reference.
+
+ !!! warning "Very low recall"
+
+ When using the `exact` matching mode, this component has very poor recall. You
+ can use the `simstring` mode to retrieve approximate matches, albeit at the
+ cost of significantly higher computation time.
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.cim10", config=dict(term_matcher="simstring"))
+
+ text = "Le patient est suivi pour fièvres typhoïde et paratyphoïde."
+
+ doc = nlp(text)
+
+ doc.ents
+ # Out: (fièvres typhoïde et paratyphoïde,)
+
+ ent = doc.ents[0]
+
+ ent.label_
+ # Out: cim10
+
+ ent.kb_id_
+ # Out: A01
+ ```
Parameters
----------
- nlp: Language
- spaCy `Language` object.
- name: str
- The name of the pipe
- attr: Union[str, Dict[str, str]]
- Attribute to match on, eg `TEXT`, `NORM`, etc.
- ignore_excluded: bool
- Whether to skip excluded tokens during matching.
- ignore_space_tokens: bool
+ nlp : Language
+ The pipeline object
+ name : str
+ The name of the component
+ attr : str
+ The default attribute to use for matching.
+ ignore_excluded : bool
+ Whether to skip excluded tokens (requires an upstream
+ pipeline to mark excluded tokens).
+ ignore_space_tokens : bool
Whether to skip space tokens during matching.
term_matcher: TerminologyTermMatcher
- The term matcher to use, either `TerminologyTermMatcher.exact` or
- `TerminologyTermMatcher.simstring`
- term_matcher_config: Dict[str, Any]
- The configuration for the term matcher
+ The matcher to use for matching phrases: one of `"exact"` or `"simstring"`
+ term_matcher_config : Dict[str, Any]
+ Parameters given to the term matcher
+ label : str
+ Label name to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
Returns
-------
TerminologyMatcher
- """
+ Authors and citation
+ --------------------
+ The `eds.cim10` pipeline was developed by AP-HP's Data Science team.
+ """
return TerminologyMatcher(
- nlp,
- label="cim10",
- regex=None,
- terms=patterns.get_patterns(),
+ nlp=nlp,
+ name=name,
+ regex=dict(),
+ terms=get_patterns(),
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
term_matcher=term_matcher,
term_matcher_config=term_matcher_config,
+ label=label,
+ span_setter=span_setter,
)
diff --git a/edsnlp/pipelines/ner/covid/factory.py b/edsnlp/pipelines/ner/covid/factory.py
index b59861232..0ab1f01c8 100644
--- a/edsnlp/pipelines/ner/covid/factory.py
+++ b/edsnlp/pipelines/ner/covid/factory.py
@@ -1,56 +1,91 @@
-from typing import Dict, Union
+from typing import Dict, List, Union
from spacy.language import Language
-from edsnlp.pipelines.core.matcher import GenericMatcher
+from edsnlp.pipelines.core.matcher.matcher import GenericMatcher
-from . import patterns
+from ...base import SpanSetterArg
+from .patterns import patterns
DEFAULT_CONFIG = dict(
attr="LOWER",
ignore_excluded=False,
ignore_space_tokens=False,
+ patterns=patterns,
+ label="covid",
+ span_setter={"ents": True, "covid": True},
)
@Language.factory(
"eds.covid",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
)
def create_component(
nlp: Language,
name: str = "eds.covid",
+ *,
attr: Union[str, Dict[str, str]] = "LOWER",
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
+ patterns: List[str] = patterns,
+ label: str = "covid",
+ span_setter: SpanSetterArg = {"ents": True, "covid": True},
):
"""
- Create a factory that returns new GenericMatcher with patterns for covid
+ The `eds.covid` pipeline component detects mentions of COVID-19.
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.covid")
+
+ text = "Le patient est admis pour une infection au coronavirus."
+
+ doc = nlp(text)
+
+ doc.ents
+ # Out: (infection au coronavirus,)
+ ```
Parameters
----------
- nlp: Language
+ nlp : Language
spaCy `Language` object.
- name: str
+ name : str
The name of the pipe
- attr: Union[str, Dict[str, str]]
+ attr : Union[str, Dict[str, str]]
Attribute to match on, eg `TEXT`, `NORM`, etc.
- ignore_excluded: bool
+ ignore_excluded : bool
Whether to skip excluded tokens during matching.
- ignore_space_tokens: bool
+ ignore_space_tokens : bool
Whether to skip space tokens during matching.
+ patterns : List[str]
+ The regex patterns to use
+ label : str
+ Label to use for matches
+ span_setter : SpanSetterArg
+ How to set matches on the doc
Returns
-------
GenericMatcher
+
+ Authors and citation
+ --------------------
+ The `eds.covid` pipeline was developed by AP-HP's Data Science team.
"""
return GenericMatcher(
- nlp,
+ nlp=nlp,
+ name=name,
terms=None,
- regex=dict(covid=patterns.pattern),
+ regex={label: patterns},
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
+ span_setter=span_setter,
)
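
A sketch of what the `{"ents": True, "covid": True}` span setter above is assumed to do: each match is written both to `doc.ents` and to the `doc.spans["covid"]` group.

```python
# Assumes the `nlp` pipeline from the docstring example above.
doc = nlp("Le patient est admis pour une infection au coronavirus.")

doc.ents
# Out: (infection au coronavirus,)

doc.spans["covid"]
# Out: [infection au coronavirus]
```
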
diff --git a/edsnlp/pipelines/ner/covid/patterns.py b/edsnlp/pipelines/ner/covid/patterns.py
index 0fecdbd4f..2a7756b40 100644
--- a/edsnlp/pipelines/ner/covid/patterns.py
+++ b/edsnlp/pipelines/ner/covid/patterns.py
@@ -6,8 +6,6 @@
r"corona[-\s]?virus",
]
-
diseases = [r"pneumopathies?", r"infections?"]
-
-pattern = r"(" + make_pattern(diseases) + r"\s[àa]u?\s)?" + make_pattern(covid)
+patterns = [r"(" + make_pattern(diseases) + r"\s[àa]u?\s)?" + make_pattern(covid)]
diff --git a/edsnlp/pipelines/ner/disorders/AIDS/AIDS.py b/edsnlp/pipelines/ner/disorders/AIDS/AIDS.py
deleted file mode 100644
index 608dd1596..000000000
--- a/edsnlp/pipelines/ner/disorders/AIDS/AIDS.py
+++ /dev/null
@@ -1,76 +0,0 @@
-"""`eds.AIDS` pipeline"""
-import itertools
-from typing import Generator
-
-from spacy.tokens import Doc, Span
-
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
-from edsnlp.pipelines.qualifiers.hypothesis import Hypothesis
-from edsnlp.pipelines.qualifiers.hypothesis.factory import (
- DEFAULT_CONFIG as DEFAULT_CONFIG_HYP,
-)
-from edsnlp.pipelines.qualifiers.negation import Negation
-from edsnlp.pipelines.qualifiers.negation.factory import (
- DEFAULT_CONFIG as DEFAULT_CONFIG_NEG,
-)
-
-from .patterns import default_patterns
-
-
-class AIDS(DisorderMatcher):
- def __init__(self, nlp, patterns):
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
-
- super().__init__(
- nlp=nlp,
- name="AIDS",
- patterns=patterns,
- include_assigned=False,
- )
-
- DEFAULT_CONFIG_NEG.update({"on_ents_only": "AIDS_opportunist"})
- DEFAULT_CONFIG_HYP.update({"on_ents_only": "AIDS_opportunist"})
-
- self.inner_negation = Negation(
- nlp,
- **DEFAULT_CONFIG_NEG,
- )
-
- self.inner_hypothesis = Hypothesis(
- nlp,
- **DEFAULT_CONFIG_HYP,
- )
-
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- spans = list(spans)
-
- doc.spans["AIDS_opportunist"] = list(
- itertools.chain.from_iterable(
- [span._.assigned.get("opportunist", []) for span in spans]
- )
- )
-
- doc = self.inner_negation(
- self.inner_hypothesis(
- doc,
- )
- )
-
- for span in spans:
- opportunists = span._.assigned.get("opportunist", [])
- if opportunists:
- opportunists = [
- ent
- for ent in opportunists
- if not (ent._.negation or ent._.hypothesis)
- ]
- stage = "stage" in span._.assigned
-
- if span._.source == "hiv" and not (opportunists or stage):
- continue
-
- yield span
-
- del doc.spans["AIDS_opportunist"]
diff --git a/edsnlp/pipelines/ner/disorders/AIDS/factory.py b/edsnlp/pipelines/ner/disorders/AIDS/factory.py
deleted file mode 100644
index 04549de2a..000000000
--- a/edsnlp/pipelines/ner/disorders/AIDS/factory.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from typing import Any, Dict, Optional
-
-from spacy.language import Language
-
-from .AIDS import AIDS
-
-DEFAULT_CONFIG = dict(patterns=None)
-
-
-@Language.factory(
- "eds.AIDS",
- default_config=DEFAULT_CONFIG,
- assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return AIDS(nlp, patterns=patterns)
diff --git a/edsnlp/pipelines/ner/disorders/CKD/CKD.py b/edsnlp/pipelines/ner/disorders/CKD/CKD.py
deleted file mode 100644
index bae478fbd..000000000
--- a/edsnlp/pipelines/ner/disorders/CKD/CKD.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""`eds.CKD` pipeline"""
-from typing import Generator, Optional
-
-from loguru import logger
-from spacy.tokens import Doc, Span
-
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
-
-from .patterns import default_patterns
-
-
-class CKD(DisorderMatcher):
- def __init__(self, nlp, patterns):
-
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
-
- super().__init__(
- nlp=nlp,
- name="CKD",
- patterns=patterns,
- )
-
- def classify_from_dfg(self, dfg_span: Optional[Span]):
- if dfg_span is None:
- return False
- try:
- dfg_value = float(dfg_span.text.replace(",", ".").strip())
- except ValueError:
- logger.trace(f"DFG value couldn't be extracted from {dfg_span.text}")
- return False
-
- return dfg_value < 60 # We keep only moderate to severe CKD
-
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- for span in spans:
-
- if span._.source == "dialysis" and "chronic" not in span._.assigned.keys():
- continue
-
- if span._.source == "general":
- if {"stage", "status"} & set(span._.assigned.keys()):
- yield span
- continue
- elif self.classify_from_dfg(span._.assigned.get("dfg", None)):
- yield span
- continue
- else:
- continue
-
- yield span
diff --git a/edsnlp/pipelines/ner/disorders/CKD/factory.py b/edsnlp/pipelines/ner/disorders/CKD/factory.py
deleted file mode 100644
index 37f618a29..000000000
--- a/edsnlp/pipelines/ner/disorders/CKD/factory.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from typing import Any, Dict, Optional
-
-from spacy.language import Language
-
-from .CKD import CKD
-
-DEFAULT_CONFIG = dict(patterns=None)
-
-
-@Language.factory(
- "eds.CKD",
- default_config=DEFAULT_CONFIG,
- assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return CKD(nlp, patterns=patterns)
diff --git a/edsnlp/pipelines/ner/disorders/COPD/COPD.py b/edsnlp/pipelines/ner/disorders/COPD/COPD.py
deleted file mode 100644
index 8600d1c05..000000000
--- a/edsnlp/pipelines/ner/disorders/COPD/COPD.py
+++ /dev/null
@@ -1,30 +0,0 @@
-"""`eds.COPD` pipeline"""
-from typing import Generator
-
-from spacy.tokens import Doc, Span
-
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
-
-from .patterns import default_patterns
-
-
-class COPD(DisorderMatcher):
- def __init__(self, nlp, patterns):
-
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
-
- super().__init__(
- nlp=nlp,
- name="COPD",
- patterns=patterns,
- )
-
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- for span in spans:
-
- if span._.source == "oxygen" and not span._.assigned:
- continue
-
- yield span
diff --git a/edsnlp/pipelines/ner/disorders/COPD/factory.py b/edsnlp/pipelines/ner/disorders/COPD/factory.py
deleted file mode 100644
index 94efdedcf..000000000
--- a/edsnlp/pipelines/ner/disorders/COPD/factory.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from typing import Any, Dict, Optional
-
-from spacy.language import Language
-
-from .COPD import COPD
-
-DEFAULT_CONFIG = dict(patterns=None)
-
-
-@Language.factory(
- "eds.COPD",
- default_config=DEFAULT_CONFIG,
- assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return COPD(nlp, patterns=patterns)
diff --git a/edsnlp/pipelines/ner/disorders/CKD/__init__.py b/edsnlp/pipelines/ner/disorders/aids/__init__.py
similarity index 100%
rename from edsnlp/pipelines/ner/disorders/CKD/__init__.py
rename to edsnlp/pipelines/ner/disorders/aids/__init__.py
diff --git a/edsnlp/pipelines/ner/disorders/aids/aids.py b/edsnlp/pipelines/ner/disorders/aids/aids.py
new file mode 100644
index 000000000..f7ee82e83
--- /dev/null
+++ b/edsnlp/pipelines/ner/disorders/aids/aids.py
@@ -0,0 +1,161 @@
+"""`eds.aids` pipeline"""
+import itertools
+from typing import Any, Dict, List, Optional, Union
+
+from spacy.language import Language
+
+from edsnlp.pipelines.base import SpanSetterArg
+from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
+from edsnlp.pipelines.qualifiers.hypothesis import HypothesisQualifier
+from edsnlp.pipelines.qualifiers.hypothesis.factory import (
+ DEFAULT_CONFIG as DEFAULT_CONFIG_HYP,
+)
+from edsnlp.pipelines.qualifiers.negation.factory import (
+ DEFAULT_CONFIG as DEFAULT_CONFIG_NEG,
+)
+from edsnlp.pipelines.qualifiers.negation.negation import NegationQualifier
+
+from .patterns import default_patterns
+
+
+class AIDSMatcher(DisorderMatcher):
+ """
+ The `eds.aids` pipeline component extracts mentions of AIDS. It will notably match:
+
+ - Mentions of VIH/HIV at the SIDA/AIDS stage
+ - Mentions of VIH/HIV with opportunistic(s) infection(s)
+
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/AIDS/patterns.py"
+ # fmt: on
+ ```
+
+ !!! warning "On HIV infection"
+
+ Pre-AIDS HIV infections are not extracted; only AIDS is.
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+ - `span._.assigned`: dictionary with the following keys, if relevant:
+ - `opportunist`: list of opportunist infections extracted around the HIV mention
+ - `stage`: stage of the HIV infection
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe(f"eds.aids")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/aids-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline object
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.aids` component was developed by AP-HP's Data Science team with a team of
+ medical experts. A paper describing in detail the development of those components
+ is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language],
+ name: str = "eds.aids",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "aids",
+ span_setter: SpanSetterArg = {"ents": True, "aids": True},
+ ):
+
+ super().__init__(
+ nlp=nlp,
+ name=name,
+ label=label,
+ patterns=patterns,
+ include_assigned=False,
+ span_setter=span_setter,
+ )
+
+ self.inner_negation = NegationQualifier(
+ nlp,
+ **{
+ **DEFAULT_CONFIG_NEG,
+ "on_ents_only": "AIDS_opportunist",
+ },
+ )
+
+ self.inner_hypothesis = HypothesisQualifier(
+ nlp,
+ **{
+ **DEFAULT_CONFIG_HYP,
+ "on_ents_only": "AIDS_opportunist",
+ },
+ )
+
+ def process(self, doc):
+ spans = list(super().process(doc))
+
+ doc.spans["AIDS_opportunist"] = list(
+ itertools.chain.from_iterable(
+ [span._.assigned.get("opportunist", []) for span in spans]
+ )
+ )
+
+ doc = self.inner_negation(self.inner_hypothesis(doc))
+
+ for span in spans:
+ opportunists = span._.assigned.get("opportunist", [])
+ if opportunists:
+ opportunists = [
+ ent
+ for ent in opportunists
+ if not (ent._.negation or ent._.hypothesis)
+ ]
+ stage = "stage" in span._.assigned
+
+ if span._.source == "hiv" and not (opportunists or stage):
+ continue
+
+ yield span
+
+ del doc.spans["AIDS_opportunist"]
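
A hypothetical usage of the attributes documented above (illustrative text and output, not a real run):

```python
# Assumes the `nlp` pipeline built in the docstring example above.
doc = nlp("Patient VIH au stade SIDA, avec antécédent de pneumocystose.")

for ent in doc.spans["aids"]:
    print(ent, ent._.detailed_status, "opportunist" in ent._.assigned)
# VIH PRESENT True
```
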
diff --git a/edsnlp/pipelines/ner/disorders/aids/factory.py b/edsnlp/pipelines/ner/disorders/aids/factory.py
new file mode 100644
index 000000000..3349b850a
--- /dev/null
+++ b/edsnlp/pipelines/ner/disorders/aids/factory.py
@@ -0,0 +1,22 @@
+from spacy import Language
+
+from edsnlp.utils.deprecation import deprecated_factory
+
+from .aids import AIDSMatcher
+from .patterns import default_patterns
+
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="aids",
+ span_setter={"ents": True, "aids": True},
+)
+
+create_component = deprecated_factory(
+ "eds.AIDS",
+ "eds.aids",
+ assigns=["doc.ents", "doc.spans"],
+)(AIDSMatcher)
+create_component = Language.factory(
+ "eds.aids",
+ assigns=["doc.ents", "doc.spans"],
+)(create_component)
diff --git a/edsnlp/pipelines/ner/disorders/AIDS/patterns.py b/edsnlp/pipelines/ner/disorders/aids/patterns.py
similarity index 100%
rename from edsnlp/pipelines/ner/disorders/AIDS/patterns.py
rename to edsnlp/pipelines/ner/disorders/aids/patterns.py
diff --git a/edsnlp/pipelines/ner/disorders/base.py b/edsnlp/pipelines/ner/disorders/base.py
index 78a8d62a6..bb8c4a307 100644
--- a/edsnlp/pipelines/ner/disorders/base.py
+++ b/edsnlp/pipelines/ner/disorders/base.py
@@ -1,10 +1,12 @@
import re
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, List, Union
from spacy import Language
from spacy.tokens import Doc, Span
+from edsnlp.pipelines.base import SpanSetterArg
from edsnlp.pipelines.core.contextual_matcher import ContextualMatcher
+from edsnlp.utils.deprecation import deprecated_getter_factory
from edsnlp.utils.filter import filter_spans
@@ -26,56 +28,64 @@ class DisorderMatcher(ContextualMatcher):
Whether to skip excluded tokens during matching.
ignore_space_tokens: bool
Whether to skip space tokens during matching.
- detailled_statusmapping: Optional[Dict[int, str]]
+ detailed_status_mapping: Dict[int, str]
Mapping from integer status (0, 1 or 2) to human-readable string
alignment_mode : str
Overwrite alignment mode.
regex_flags : Union[re.RegexFlag, int]
RegExp flags to use when matching, filtering and assigning (See
- [here](https://docs.python.org/3/library/re.html#flags))
-
+ the [re docs](https://docs.python.org/3/library/re.html#flags))
"""
def __init__(
self,
nlp: Language,
name: str,
+ *,
+ label: str,
patterns: Union[Dict[str, Any], List[Dict[str, Any]]],
include_assigned: bool = True,
ignore_excluded: bool = True,
ignore_space_tokens: bool = True,
- detailled_statusmapping: Optional[Dict[int, str]] = None,
- ):
- self.nlp = nlp
- self.detailled_statusmapping = detailled_statusmapping or {
+ detailed_status_mapping: Dict[int, str] = {
0: "ABSENT",
1: "PRESENT",
- }
+ },
+ alignment_mode: str = "expand",
+ regex_flags: Union[re.RegexFlag, int] = re.S,
+ span_setter: SpanSetterArg,
+ ):
+ self.nlp = nlp
+ self.detailed_status_mapping = detailed_status_mapping
super().__init__(
nlp=nlp,
name=name,
+ label=label,
attr="NORM",
patterns=patterns,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
- regex_flags=re.S,
- alignment_mode="expand",
+ regex_flags=regex_flags,
+ alignment_mode=alignment_mode,
assign_as_span=True,
include_assigned=include_assigned,
+ span_setter=span_setter,
)
- self.set_extensions()
-
- @classmethod
- def set_extensions(cl) -> None:
+ def set_extensions(self) -> None:
super().set_extensions()
if not Span.has_extension("status"):
Span.set_extension("status", default=1)
+ if not Span.has_extension("detailed_status"):
+ Span.set_extension("detailed_status", default="PRESENT")
if not Span.has_extension("detailled_status"):
- Span.set_extension("detailled_status", default="PRESENT")
+ Span.set_extension(
+ "detailled_status",
+ getter=deprecated_getter_factory("detailed_status", "detailed_status"),
+ )
def __call__(self, doc: Doc) -> Doc:
"""
@@ -91,18 +101,10 @@ def __call__(self, doc: Doc) -> Doc:
doc : Doc
annotated spaCy Doc object
"""
- spans = self.postprocess(doc, self.process(doc))
- spans = filter_spans(spans)
-
+ spans = list(self.process(doc))
for span in spans:
- span._.detailled_status = self.detailled_statusmapping[span._.status]
+ span._.detailed_status = self.detailed_status_mapping[span._.status]
- doc.spans[self.name] = spans
+ self.set_spans(doc, filter_spans(spans))
return doc
-
- def postprocess(self, doc: Doc, spans: Iterable[Span]):
- """
- Can be overrid
- """
- yield from spans
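
A minimal sketch of the backward-compatible extension registered above: reading the misspelled `detailled_status` is assumed to warn and forward to the new `detailed_status` attribute (`deprecated_getter_sketch` is a hypothetical stand-in for edsnlp's `deprecated_getter_factory`):

```python
import warnings

from spacy.tokens import Span


def deprecated_getter_sketch(deprecated: str, new: str):
    def getter(span: Span):
        warnings.warn(
            f"span._.{deprecated} is deprecated, use span._.{new}",
            DeprecationWarning,
        )
        return span._.get(new)

    return getter


if not Span.has_extension("detailed_status"):
    Span.set_extension("detailed_status", default="PRESENT")
if not Span.has_extension("detailled_status"):
    Span.set_extension(
        "detailled_status",
        getter=deprecated_getter_sketch("detailled_status", "detailed_status"),
    )
```
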
diff --git a/edsnlp/pipelines/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py b/edsnlp/pipelines/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py
index ac4f90de2..d10da1c7d 100644
--- a/edsnlp/pipelines/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py
+++ b/edsnlp/pipelines/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py
@@ -1,28 +1,109 @@
"""`eds.cerebrovascular_accident` pipeline"""
-from typing import Generator
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
from spacy.tokens import Doc, Span
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
+from edsnlp.pipelines.base import SpanSetterArg
+from ..base import DisorderMatcher
from .patterns import default_patterns
-class CerebrovascularAccident(DisorderMatcher):
- def __init__(self, nlp, patterns):
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+class CerebrovascularAccidentMatcher(DisorderMatcher):
+ """
+ The `eds.cerebrovascular_accident` pipeline component extracts mentions of
+ cerebrovascular accident. It will notably match:
+
+ - Mentions of AVC/AIT
+ - Mentions of bleeding, hemorrhage, thrombus, ischemia, etc., localized in the brain
+
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/cerebrovascular_accident/patterns.py"
+ # fmt: on
+ ```
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+
+ Usage
+ -----
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe(f"eds.cerebrovascular_accident")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/cerebrovascular-accident-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.cerebrovascular_accident` component was developed by AP-HP's Data Science
+ team with a team of medical experts. A paper describing in detail the development
+ of those components is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language],
+ name: str = "eds.cerebrovascular_accident",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "cerebrovascular_accident",
+ span_setter: SpanSetterArg = {"ents": True, "cerebrovascular_accident": True},
+ ):
super().__init__(
nlp=nlp,
- name="cerebrovascular_accident",
+ name=name,
+ label=label,
patterns=patterns,
include_assigned=False,
+ span_setter=span_setter,
)
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- for span in spans:
+ def process(self, doc: Doc) -> List[Span]:
+ for span in super().process(doc):
if (span._.source == "with_localization") and (
"brain_localized" not in span._.assigned.keys()
):
diff --git a/edsnlp/pipelines/ner/disorders/cerebrovascular_accident/factory.py b/edsnlp/pipelines/ner/disorders/cerebrovascular_accident/factory.py
index 65315f260..c84fe2dcf 100644
--- a/edsnlp/pipelines/ner/disorders/cerebrovascular_accident/factory.py
+++ b/edsnlp/pipelines/ner/disorders/cerebrovascular_accident/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .cerebrovascular_accident import CerebrovascularAccident
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .cerebrovascular_accident import CerebrovascularAccidentMatcher
+from .patterns import default_patterns
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="cerebrovascular_accident",
+ span_setter={"ents": True, "cerebrovascular_accident": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.cerebrovascular_accident",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return CerebrovascularAccident(nlp, patterns=patterns)
+)(CerebrovascularAccidentMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/COPD/__init__.py b/edsnlp/pipelines/ner/disorders/ckd/__init__.py
similarity index 100%
rename from edsnlp/pipelines/ner/disorders/COPD/__init__.py
rename to edsnlp/pipelines/ner/disorders/ckd/__init__.py
diff --git a/edsnlp/pipelines/ner/disorders/ckd/ckd.py b/edsnlp/pipelines/ner/disorders/ckd/ckd.py
new file mode 100644
index 000000000..c66f63f04
--- /dev/null
+++ b/edsnlp/pipelines/ner/disorders/ckd/ckd.py
@@ -0,0 +1,149 @@
+"""`eds.ckd` pipeline"""
+from typing import Any, Dict, List, Optional, Union
+
+from loguru import logger
+from spacy import Language
+from spacy.tokens import Doc, Span
+
+from edsnlp.pipelines.base import SpanSetterArg
+
+from ..base import DisorderMatcher
+from .patterns import default_patterns
+
+
+class CKDMatcher(DisorderMatcher):
+ """
+ The `eds.ckd` pipeline component extracts mentions of CKD (Chronic Kidney Disease).
+ It will notably match:
+
+ - Mentions of various diseases (see below)
+ - Kidney transplantation
+ - Chronic dialysis
+ - Renal failure **from stage 3 to 5**. The stage is extracted by trying 3 methods:
+ - Extracting the mentioned stage directly ("*IRC stade IV*")
+ - Extracting the severity directly ("*IRC terminale*")
+ - Extracting the mentioned GFR (DFG in French) ("*IRC avec DFG estimé à 30
+ mL/min/1,73m2*")
+
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/CKD/patterns.py"
+ # fmt: on
+ ```
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+ - `span._.assigned`: dictionary with the following keys, if relevant:
+ - `stage`: mentioned renal failure stage
+ - `status`: mentioned renal failure severity (e.g. modérée, sévère, terminale,
+ etc.)
+ - `dfg`: mentioned DFG
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe(f"eds.CKD")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/ckd-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.ckd` component was developed by AP-HP's Data Science team with a team of
+ medical experts. A paper describing in detail the development of those components
+ is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language],
+ name: str = "eds.ckd",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "ckd",
+ span_setter: SpanSetterArg = {"ents": True, "ckd": True},
+ ):
+
+ super().__init__(
+ nlp=nlp,
+ name=name,
+ label=label,
+ patterns=patterns,
+ span_setter=span_setter,
+ )
+
+ def classify_from_dfg(self, dfg_span: Optional[Span]):
+ if dfg_span is None:
+ return False
+ try:
+ dfg_value = float(dfg_span.text.replace(",", ".").strip())
+ except ValueError:
+ logger.trace(f"DFG value couldn't be extracted from {dfg_span.text}")
+ return False
+
+ return dfg_value < 60 # We keep only moderate to severe CKD
+
+ def process(self, doc: Doc):
+ for span in super().process(doc):
+ if span._.source == "dialysis" and "chronic" not in span._.assigned.keys():
+ continue
+
+ if span._.source == "general":
+ if {"stage", "status"} & set(span._.assigned.keys()):
+ yield span
+ elif self.classify_from_dfg(span._.assigned.get("dfg", None)):
+ yield span
+ else:
+ yield span
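
The GFR threshold rule above can be checked in isolation: values are parsed with a comma-to-dot fix, and only a DFG below 60 mL/min/1.73m2 keeps the mention (moderate to severe CKD).

```python
def keep_from_dfg(text: str) -> bool:
    # Mirrors classify_from_dfg: unparseable values are dropped.
    try:
        return float(text.replace(",", ".").strip()) < 60
    except ValueError:
        return False


assert keep_from_dfg("30") and keep_from_dfg("59,9")
assert not (keep_from_dfg("75") or keep_from_dfg("inconnu"))
```
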
diff --git a/edsnlp/pipelines/ner/disorders/ckd/factory.py b/edsnlp/pipelines/ner/disorders/ckd/factory.py
new file mode 100644
index 000000000..8b816ae90
--- /dev/null
+++ b/edsnlp/pipelines/ner/disorders/ckd/factory.py
@@ -0,0 +1,22 @@
+from spacy import Language
+
+from edsnlp.utils.deprecation import deprecated_factory
+
+from .ckd import CKDMatcher
+from .patterns import default_patterns
+
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="ckd",
+ span_setter={"ents": True, "ckd": True},
+)
+
+create_component = deprecated_factory(
+ "eds.CKD",
+ "eds.ckd",
+ assigns=["doc.ents", "doc.spans"],
+)(CKDMatcher)
+create_component = Language.factory(
+ "eds.ckd",
+ assigns=["doc.ents", "doc.spans"],
+)(create_component)
diff --git a/edsnlp/pipelines/ner/disorders/CKD/patterns.py b/edsnlp/pipelines/ner/disorders/ckd/patterns.py
similarity index 100%
rename from edsnlp/pipelines/ner/disorders/CKD/patterns.py
rename to edsnlp/pipelines/ner/disorders/ckd/patterns.py
diff --git a/edsnlp/pipelines/ner/disorders/congestive_heart_failure/congestive_heart_failure.py b/edsnlp/pipelines/ner/disorders/congestive_heart_failure/congestive_heart_failure.py
index e26bab136..6de90dcc8 100644
--- a/edsnlp/pipelines/ner/disorders/congestive_heart_failure/congestive_heart_failure.py
+++ b/edsnlp/pipelines/ner/disorders/congestive_heart_failure/congestive_heart_failure.py
@@ -1,20 +1,96 @@
"""`eds.congestive_heart_failure` pipeline"""
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
+from edsnlp.pipelines.base import SpanSetterArg
+from ..base import DisorderMatcher
from .patterns import default_patterns
-class CongestiveHeartFailure(DisorderMatcher):
- def __init__(self, nlp, patterns):
+class CongestiveHeartFailureMatcher(DisorderMatcher):
+ """
+ The `eds.congestive_heart_failure` pipeline component extracts mentions of
+ congestive heart failure. It will notably match:
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+ - Mentions of various diseases (see below)
+ - Heart transplantation
+ - AF (Atrial Fibrillation)
+ - Pacemaker
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/congestive_heart_failure/patterns.py"
+ # fmt: on
+ ```
+
+ Usage
+ -----
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe(f"eds.congestive_heart_failure")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/congestive-heart-failure-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline object
+ name : str,
+ The name of the component
+ patterns : Optional[Dict[str, Any]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.congestive_heart_failure` component was developed by AP-HP's Data Science
+ team with a team of medical experts. A paper describing in detail the development
+ of those components is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.congestive_heart_failure",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "congestive_heart_failure",
+ span_setter: SpanSetterArg = {"ents": True, "congestive_heart_failure": True},
+ ):
super().__init__(
nlp=nlp,
- name="congestive_heart_failure",
+ name=name,
+ label=label,
patterns=patterns,
+ span_setter=span_setter,
)
diff --git a/edsnlp/pipelines/ner/disorders/congestive_heart_failure/factory.py b/edsnlp/pipelines/ner/disorders/congestive_heart_failure/factory.py
index 449ec1a62..f5a082496 100644
--- a/edsnlp/pipelines/ner/disorders/congestive_heart_failure/factory.py
+++ b/edsnlp/pipelines/ner/disorders/congestive_heart_failure/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .congestive_heart_failure import CongestiveHeartFailure
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .congestive_heart_failure import CongestiveHeartFailureMatcher
+from .patterns import default_patterns
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="congestive_heart_failure",
+ span_setter={"ents": True, "congestive_heart_failure": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.congestive_heart_failure",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return CongestiveHeartFailure(nlp, patterns=patterns)
+)(CongestiveHeartFailureMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/connective_tissue_disease/connective_tissue_disease.py b/edsnlp/pipelines/ner/disorders/connective_tissue_disease/connective_tissue_disease.py
index 89287f2d0..9c04d6e90 100644
--- a/edsnlp/pipelines/ner/disorders/connective_tissue_disease/connective_tissue_disease.py
+++ b/edsnlp/pipelines/ner/disorders/connective_tissue_disease/connective_tissue_disease.py
@@ -1,28 +1,104 @@
"""`eds.connective_tissue_disease` pipeline"""
-from typing import Generator
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
from spacy.tokens import Doc, Span
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
+from edsnlp.pipelines.base import SpanSetterArg
+from ..base import DisorderMatcher
from .patterns import default_patterns
-class ConnectiveTissueDisease(DisorderMatcher):
- def __init__(self, nlp, patterns):
+class ConnectiveTissueDiseaseMatcher(DisorderMatcher):
+ """
+ The `eds.connective_tissue_disease` pipeline component extracts mentions of
+ connective tissue diseases.
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/connective_tissue_disease/patterns.py"
+ # fmt: on
+ ```
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe(f"eds.connective_tissue_disease")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/connective-tissue-disease-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline object
+ name : str
+ The name of the component
+ patterns : Optional[Dict[str, Any]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.connective_tissue_disease` component was developed by AP-HP's Data Science
+ team with a team of medical experts. A paper describing in detail the development
+ of those components is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.connective_tissue_disease",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "connective_tissue_disease",
+ span_setter: SpanSetterArg = {"ents": True, "connective_tissue_disease": True},
+ ):
super().__init__(
nlp=nlp,
- name="connective_tissue_disease",
+ name=name,
patterns=patterns,
+ label=label,
+ span_setter=span_setter,
)
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- for span in spans:
+ def process(self, doc: Doc) -> List[Span]:
+ for span in super().process(doc):
if span._.source == "lupus" and all(tok.is_upper for tok in span):
# Huge chance of FP / Title section
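
The `postprocess` hook becomes an override of `process` that filters what the parent class yields. The hunk is truncated here, but the visible test suggests that all-uppercase lupus matches are skipped and every other span is yielded, roughly as in this sketch (import path per the file above; the `continue`/`yield` tail is an assumption):

```python
from typing import Iterable

from spacy.tokens import Doc, Span

from edsnlp.pipelines.ner.disorders.connective_tissue_disease.connective_tissue_disease import (  # noqa: E501
    ConnectiveTissueDiseaseMatcher,
)


class Sketch(ConnectiveTissueDiseaseMatcher):
    def process(self, doc: Doc) -> Iterable[Span]:
        for span in super().process(doc):
            # "LUPUS" in full caps is most likely a section title rather
            # than a clinical mention, hence the huge chance of false
            # positives noted in the comment above.
            if span._.source == "lupus" and all(tok.is_upper for tok in span):
                continue
            yield span
```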
diff --git a/edsnlp/pipelines/ner/disorders/connective_tissue_disease/factory.py b/edsnlp/pipelines/ner/disorders/connective_tissue_disease/factory.py
index d2d9241aa..7d76d4343 100644
--- a/edsnlp/pipelines/ner/disorders/connective_tissue_disease/factory.py
+++ b/edsnlp/pipelines/ner/disorders/connective_tissue_disease/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .connective_tissue_disease import ConnectiveTissueDisease
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .connective_tissue_disease import ConnectiveTissueDiseaseMatcher
+from .patterns import default_patterns
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="connective_tissue_disease",
+ span_setter={"ents": True, "connective_tissue_disease": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.connective_tissue_disease",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return ConnectiveTissueDisease(nlp, patterns=patterns)
+)(ConnectiveTissueDiseaseMatcher)
diff --git a/edsnlp/pipelines/ner/scores/elstonellis/__init__.py b/edsnlp/pipelines/ner/disorders/copd/__init__.py
similarity index 100%
rename from edsnlp/pipelines/ner/scores/elstonellis/__init__.py
rename to edsnlp/pipelines/ner/disorders/copd/__init__.py
diff --git a/edsnlp/pipelines/ner/disorders/copd/copd.py b/edsnlp/pipelines/ner/disorders/copd/copd.py
new file mode 100644
index 000000000..a2f794adf
--- /dev/null
+++ b/edsnlp/pipelines/ner/disorders/copd/copd.py
@@ -0,0 +1,108 @@
+"""`eds.copd` pipeline"""
+from typing import Any, Dict, List, Optional, Union
+
+from spacy import Language
+from spacy.tokens import Doc
+
+from edsnlp.pipelines.base import SpanSetterArg
+
+from ..base import DisorderMatcher
+from .patterns import default_patterns
+
+
+class COPDMatcher(DisorderMatcher):
+ """
+ The `eds.copd` pipeline component extracts mentions of COPD (*Chronic obstructive
+ pulmonary disease*). It will notably match:
+
+ - Mentions of various diseases (see below)
+ - Pulmonary hypertension
+ - Long-term oxygen therapy
+
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/COPD/patterns.py"
+ # fmt: on
+ ```
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe("eds.copd")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/copd-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.copd` component was developed by AP-HP's Data Science team with a team of
+ medical experts. A paper describing in detail the development of those components
+ is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.copd",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "copd",
+ span_setter: SpanSetterArg = {"ents": True, "copd": True},
+ ):
+ super().__init__(
+ nlp=nlp,
+ name=name,
+ patterns=patterns,
+ label=label,
+ span_setter=span_setter,
+ )
+
+ def process(self, doc: Doc):
+ for span in super().process(doc):
+ if span._.source == "oxygen" and not span._.assigned:
+ continue
+
+ yield span
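
Since long-term oxygen therapy is only a proxy for COPD, `process` keeps `oxygen`-sourced matches only when additional cues were captured into `span._.assigned`. A quick way to inspect what survives the filter (the example text is illustrative; actual matches depend on the shipped patterns):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.copd")

doc = nlp("Patient sous oxygénothérapie au long cours pour BPCO.")
for ent in doc.ents:
    # span._.source identifies the pattern group that produced the match
    print(ent.text, ent._.source, ent._.assigned)
```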
diff --git a/edsnlp/pipelines/ner/disorders/copd/factory.py b/edsnlp/pipelines/ner/disorders/copd/factory.py
new file mode 100644
index 000000000..1fda6fbb0
--- /dev/null
+++ b/edsnlp/pipelines/ner/disorders/copd/factory.py
@@ -0,0 +1,22 @@
+from spacy import Language
+
+from edsnlp.utils.deprecation import deprecated_factory
+
+from .copd import COPDMatcher
+from .patterns import default_patterns
+
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="copd",
+ span_setter={"ents": True, "copd": True},
+)
+
+create_component = deprecated_factory(
+ "eds.COPD",
+ "eds.copd",
+ assigns=["doc.ents", "doc.spans"],
+)(COPDMatcher)
+create_component = Language.factory(
+ "eds.copd",
+ assigns=["doc.ents", "doc.spans"],
+)(create_component)
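
The double registration keeps backward compatibility: `deprecated_factory` re-exposes the matcher under the old capitalized name, while `Language.factory` registers the canonical lowercase one. Downstream configs can migrate at their own pace:

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.copd")  # canonical name

legacy = spacy.blank("eds")
# The old name should still resolve, presumably emitting a deprecation
# warning that points to "eds.copd":
legacy.add_pipe("eds.COPD")
```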
diff --git a/edsnlp/pipelines/ner/disorders/COPD/patterns.py b/edsnlp/pipelines/ner/disorders/copd/patterns.py
similarity index 100%
rename from edsnlp/pipelines/ner/disorders/COPD/patterns.py
rename to edsnlp/pipelines/ner/disorders/copd/patterns.py
diff --git a/edsnlp/pipelines/ner/disorders/dementia/dementia.py b/edsnlp/pipelines/ner/disorders/dementia/dementia.py
index efceebcd4..5fec3b913 100644
--- a/edsnlp/pipelines/ner/disorders/dementia/dementia.py
+++ b/edsnlp/pipelines/ner/disorders/dementia/dementia.py
@@ -1,20 +1,97 @@
"""`eds.dementia` pipeline"""
+from typing import Any, Dict, List, Optional, Union
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
+from spacy import Language
+from edsnlp.pipelines.base import SpanSetterArg
+
+from ..base import DisorderMatcher
from .patterns import default_patterns
-class Dementia(DisorderMatcher):
- def __init__(self, nlp, patterns):
+class DementiaMatcher(DisorderMatcher):
+ """
+ The `eds.dementia` pipeline component extracts mentions of dementia.
+
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/dementia/patterns.py"
+ # fmt: on
+ ```
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe("eds.dementia")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/dementia-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ The span setter to use
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+ Authors and citation
+ --------------------
+ The `eds.dementia` component was developed by AP-HP's Data Science team with a team
+ of medical experts. A paper describing in detail the development of those
+ components is being drafted and will soon be available.
+ """
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.dementia",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "dementia",
+ span_setter: SpanSetterArg = {"ents": True, "dementia": True},
+ ):
super().__init__(
nlp=nlp,
- name="dementia",
+ name=name,
patterns=patterns,
+ label=label,
+ span_setter=span_setter,
)
diff --git a/edsnlp/pipelines/ner/disorders/dementia/factory.py b/edsnlp/pipelines/ner/disorders/dementia/factory.py
index 66d9ed606..4cd39bfdf 100644
--- a/edsnlp/pipelines/ner/disorders/dementia/factory.py
+++ b/edsnlp/pipelines/ner/disorders/dementia/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .dementia import Dementia
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .dementia import DementiaMatcher
+from .patterns import default_patterns
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="dementia",
+ span_setter={"ents": True, "dementia": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.dementia",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return Dementia(nlp, patterns=patterns)
+)(DementiaMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/diabetes/diabetes.py b/edsnlp/pipelines/ner/disorders/diabetes/diabetes.py
index c0b3bfb45..46240967a 100644
--- a/edsnlp/pipelines/ner/disorders/diabetes/diabetes.py
+++ b/edsnlp/pipelines/ner/disorders/diabetes/diabetes.py
@@ -1,31 +1,116 @@
"""`eds.diabetes` pipeline"""
-from typing import Generator
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
from spacy.tokens import Doc, Span
from edsnlp.matchers.regex import RegexMatcher
from edsnlp.matchers.utils import get_text
-from edsnlp.pipelines.core.contextual_matcher.contextual_matcher import get_window
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
+from edsnlp.pipelines.base import SpanSetterArg
+from edsnlp.pipelines.core.contextual_matcher.contextual_matcher import (
+ get_window,
+)
+from ..base import DisorderMatcher
from .patterns import COMPLICATIONS, default_patterns
-class Diabetes(DisorderMatcher):
- def __init__(self, nlp, patterns):
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+class DiabetesMatcher(DisorderMatcher):
+ """
+ The `eds.diabetes` pipeline component extracts mentions of diabetes.
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/diabetes/patterns.py"
+ # fmt: on
+ ```
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to either
+ - `"WITH_COMPLICATION"` if the diabetes is complicated (e.g., via organ
+ damage)
+ - `"WITHOUT_COMPLICATION"` otherwise
+ - `span._.assigned`: dictionary with the following keys, if relevant:
+ - `type`: type of diabetes (I or II)
+ - `insulin`: if the diabetes is insulin-dependent
+ - `corticoid`: if the diabetes is corticoid-induced
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe("eds.diabetes")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/diabetes-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ The span setter to use
+
+ Authors and citation
+ --------------------
+ The `eds.diabetes` component was developed by AP-HP's Data Science team with a team
+ of medical experts. A paper describing in detail the development of those
+ components is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.diabetes",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "diabetes",
+ span_setter: SpanSetterArg = {"ents": True, "diabetes": True},
+ ):
super().__init__(
nlp=nlp,
- name="diabetes",
+ name=name,
patterns=patterns,
- detailled_statusmapping={
+ detailed_status_mapping={
0: "ABSENT",
1: "WITHOUT_COMPLICATION",
2: "WITH_COMPLICATION",
},
+ label=label,
+ span_setter=span_setter,
)
self.complication_matcher = RegexMatcher(
@@ -35,8 +120,8 @@ def __init__(self, nlp, patterns):
regex=dict(far_complications=COMPLICATIONS)
)
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- for span in spans:
+ def process(self, doc: Doc) -> List[Span]:
+ for span in super().process(doc):
if span._.source == "complicated":
span._.status = 2
@@ -61,6 +146,6 @@ def has_far_complications(self, span: Span):
"""
window = (0, 50)
context = get_window(span, window, limit_to_sentence=False)
- if next(self.complication_matcher(context), None) is not None:
+ if next(iter(self.complication_matcher(context)), None) is not None:
return True
return False
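
Two fixes are bundled here: the `detailled_statusmapping` argument becomes `detailed_status_mapping`, and the complication lookup wraps the matcher output in `iter(...)`, since `RegexMatcher` returns an iterable rather than an iterator and `next` requires the latter. The windowed lookup can be read as this standalone sketch (same APIs as imported above):

```python
from edsnlp.matchers.regex import RegexMatcher
from edsnlp.pipelines.core.contextual_matcher.contextual_matcher import get_window


def has_far_complications(span, complication_matcher: RegexMatcher) -> bool:
    # Scan up to 50 tokens after the mention, crossing sentence boundaries,
    # for any complication pattern.
    context = get_window(span, (0, 50), limit_to_sentence=False)
    # iter() is required because the matcher returns an iterable,
    # not an iterator.
    return next(iter(complication_matcher(context)), None) is not None
```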
diff --git a/edsnlp/pipelines/ner/disorders/diabetes/factory.py b/edsnlp/pipelines/ner/disorders/diabetes/factory.py
index 2ed32155a..32c38b614 100644
--- a/edsnlp/pipelines/ner/disorders/diabetes/factory.py
+++ b/edsnlp/pipelines/ner/disorders/diabetes/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .diabetes import Diabetes
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .diabetes import DiabetesMatcher
+from .patterns import default_patterns
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="diabetes",
+ span_setter={"ents": True, "diabetes": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.diabetes",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return Diabetes(nlp, patterns=patterns)
+)(DiabetesMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/diabetes/patterns.py b/edsnlp/pipelines/ner/disorders/diabetes/patterns.py
index a9854aaca..b51ad1a89 100644
--- a/edsnlp/pipelines/ner/disorders/diabetes/patterns.py
+++ b/edsnlp/pipelines/ner/disorders/diabetes/patterns.py
@@ -54,7 +54,7 @@
window=6,
),
dict(
- name="cortico",
+ name="corticoid",
regex=r"(bctc\b|cortico(?:.?induit)?)",
window=6,
),
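
Renaming the assign group from `cortico` to `corticoid` aligns the pattern with the `corticoid` key documented in the matcher's `span._.assigned` extension. A quick check (illustrative text; matching depends on the shipped patterns):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.diabetes")

doc = nlp("Diabète cortico-induit découvert en 2020.")
for ent in doc.ents:
    print(ent.text, ent._.assigned.get("corticoid"))
```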
diff --git a/edsnlp/pipelines/ner/disorders/hemiplegia/factory.py b/edsnlp/pipelines/ner/disorders/hemiplegia/factory.py
index 3e1141fae..2f189c6ff 100644
--- a/edsnlp/pipelines/ner/disorders/hemiplegia/factory.py
+++ b/edsnlp/pipelines/ner/disorders/hemiplegia/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .hemiplegia import Hemiplegia
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .hemiplegia import HemiplegiaMatcher
+from .patterns import default_patterns
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="hemiplegia",
+ span_setter={"ents": True, "hemiplegia": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.hemiplegia",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return Hemiplegia(nlp, patterns=patterns)
+)(HemiplegiaMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/hemiplegia/hemiplegia.py b/edsnlp/pipelines/ner/disorders/hemiplegia/hemiplegia.py
index 07be83bb0..96a1a0058 100644
--- a/edsnlp/pipelines/ner/disorders/hemiplegia/hemiplegia.py
+++ b/edsnlp/pipelines/ner/disorders/hemiplegia/hemiplegia.py
@@ -1,20 +1,96 @@
"""`eds.hemiplegia` pipeline"""
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
+from edsnlp.pipelines.base import SpanSetterArg
+from ..base import DisorderMatcher
from .patterns import default_patterns
-class Hemiplegia(DisorderMatcher):
- def __init__(self, nlp, patterns):
+class HemiplegiaMatcher(DisorderMatcher):
+ """
+ The `eds.hemiplegia` pipeline component extracts mentions of hemiplegia.
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/hemiplegia/patterns.py"
+ # fmt: on
+ ```
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe("eds.hemiplegia")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/hemiplegia-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.hemiplegia` component was developed by AP-HP's Data Science team with a
+ team of medical experts. A paper describing in detail the development of those
+ components is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.hemiplegia",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "hemiplegia",
+ span_setter: SpanSetterArg = {"ents": True, "hemiplegia": True},
+ ):
super().__init__(
nlp=nlp,
- name="hemiplegia",
+ name=name,
patterns=patterns,
+ label=label,
+ span_setter=span_setter,
)
diff --git a/edsnlp/pipelines/ner/disorders/leukemia/factory.py b/edsnlp/pipelines/ner/disorders/leukemia/factory.py
index 888615485..c1954fc0b 100644
--- a/edsnlp/pipelines/ner/disorders/leukemia/factory.py
+++ b/edsnlp/pipelines/ner/disorders/leukemia/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .leukemia import Leukemia
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .leukemia import LeukemiaMatcher
+from .patterns import default_patterns
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="leukemia",
+ span_setter={"ents": True, "leukemia": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.leukemia",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return Leukemia(nlp, patterns=patterns)
+)(LeukemiaMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/leukemia/leukemia.py b/edsnlp/pipelines/ner/disorders/leukemia/leukemia.py
index e8cec989a..9ba239d51 100644
--- a/edsnlp/pipelines/ner/disorders/leukemia/leukemia.py
+++ b/edsnlp/pipelines/ner/disorders/leukemia/leukemia.py
@@ -1,18 +1,96 @@
"""`eds.leukemia` pipeline"""
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
+
+from edsnlp.pipelines.base import SpanSetterArg
+
+from ..base import DisorderMatcher
from .patterns import default_patterns
-class Leukemia(DisorderMatcher):
- def __init__(self, nlp, patterns):
+class LeukemiaMatcher(DisorderMatcher):
+ """
+ The `eds.leukemia` pipeline component extracts mentions of leukemia.
+
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/leukemia/patterns.py"
+ # fmt: on
+ ```
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe("eds.leukemia")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/leukemia-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+ Authors and citation
+ --------------------
+ The `eds.leukemia` component was developed by AP-HP's Data Science team with a team
+ of medical experts. A paper describing in detail the development of those
+ components is being drafted and will soon be available.
+ """
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.leukemia",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "leukemia",
+ span_setter: SpanSetterArg = {"ents": True, "leukemia": True},
+ ):
super().__init__(
nlp=nlp,
- name="leukemia",
+ name=name,
patterns=patterns,
+ label=label,
+ span_setter=span_setter,
)
diff --git a/edsnlp/pipelines/ner/disorders/liver_disease/factory.py b/edsnlp/pipelines/ner/disorders/liver_disease/factory.py
index 403964b8a..3b7fd97b8 100644
--- a/edsnlp/pipelines/ner/disorders/liver_disease/factory.py
+++ b/edsnlp/pipelines/ner/disorders/liver_disease/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .liver_disease import LiverDisease
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .liver_disease import LiverDiseaseMatcher
+from .patterns import default_patterns
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="liver_disease",
+ span_setter={"ents": True, "liver_disease": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.liver_disease",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return LiverDisease(nlp, patterns=patterns)
+)(LiverDiseaseMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/liver_disease/liver_disease.py b/edsnlp/pipelines/ner/disorders/liver_disease/liver_disease.py
index 1fbff9e79..8f7e3a0f7 100644
--- a/edsnlp/pipelines/ner/disorders/liver_disease/liver_disease.py
+++ b/edsnlp/pipelines/ner/disorders/liver_disease/liver_disease.py
@@ -1,32 +1,110 @@
"""`eds.liver_disease` pipeline"""
-from typing import Generator
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
from spacy.tokens import Doc, Span
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
+from edsnlp.pipelines.base import SpanSetterArg
+from ..base import DisorderMatcher
from .patterns import default_patterns
-class LiverDisease(DisorderMatcher):
- def __init__(self, nlp, patterns):
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+class LiverDiseaseMatcher(DisorderMatcher):
+ """
+ The `eds.liver_disease` pipeline component extracts mentions of liver disease.
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/liver_disease/patterns.py"
+ # fmt: on
+ ```
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to either
+ - `"MILD"` for mild liver diseases
+ - `"MODERATE_TO_SEVERE"` else
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe("eds.liver_disease")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/liver-disease-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.liver_disease` component was developed by AP-HP's Data Science team with a
+ team of medical experts. A paper describing in detail the development of those
+ components is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.liver_disease",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "liver_disease",
+ span_setter: SpanSetterArg = {"ents": True, "liver_disease": True},
+ ):
super().__init__(
nlp=nlp,
- name="liver_disease",
+ name=name,
patterns=patterns,
- detailled_statusmapping={
+ detailed_status_mapping={
0: "ABSENT",
1: "MILD",
2: "MODERATE_TO_SEVERE",
},
+ label=label,
+ span_setter=span_setter,
)
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- for span in spans:
+ def process(self, doc: Doc) -> List[Span]:
+ for span in super().process(doc):
if span._.source in {"moderate_severe", "transplant"}:
span._.status = 2
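
The numeric status set here is rendered through the `detailed_status_mapping` above: transplant and explicitly severe sources are reported as `"MODERATE_TO_SEVERE"`, everything else as `"MILD"`. For instance (illustrative text; actual matching depends on the shipped patterns):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.liver_disease")

doc = nlp("Cirrhose décompensée. Antécédent de greffe hépatique.")
for ent in doc.ents:
    print(ent.text, ent._.detailed_status)
```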
diff --git a/edsnlp/pipelines/ner/disorders/lymphoma/factory.py b/edsnlp/pipelines/ner/disorders/lymphoma/factory.py
index 430986cd9..1b128288a 100644
--- a/edsnlp/pipelines/ner/disorders/lymphoma/factory.py
+++ b/edsnlp/pipelines/ner/disorders/lymphoma/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .lymphoma import Lymphoma
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .lymphoma import LymphomaMatcher
+from .patterns import default_patterns
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="lymphoma",
+ span_setter={"ents": True, "lymphoma": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.lymphoma",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return Lymphoma(nlp, patterns=patterns)
+)(LymphomaMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/lymphoma/lymphoma.py b/edsnlp/pipelines/ner/disorders/lymphoma/lymphoma.py
index 0013d146e..90ce000b2 100644
--- a/edsnlp/pipelines/ner/disorders/lymphoma/lymphoma.py
+++ b/edsnlp/pipelines/ner/disorders/lymphoma/lymphoma.py
@@ -1,18 +1,100 @@
"""`eds.lymphoma` pipeline"""
+from typing import Any, Dict, List, Optional, Union
+
+from spacy import Language
+
+from edsnlp.pipelines.base import SpanSetterArg
from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
from .patterns import default_patterns
-class Lymphoma(DisorderMatcher):
- def __init__(self, nlp, patterns):
+class LymphomaMatcher(DisorderMatcher):
+ """
+ The `eds.lymphoma` pipeline component extracts mentions of lymphoma.
+
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/lymphoma/patterns.py"
+ # fmt: on
+ ```
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+
+ !!! warning "Monoclonal gammopathy"
+
+ Monoclonal gammopathies are not extracted by this pipeline.
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe("eds.lymphoma")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/lymphoma-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+ Authors and citation
+ --------------------
+ The `eds.lymphoma` component was developed by AP-HP's Data Science team with a team
+ of medical experts. A paper describing in detail the development of those
+ components is being drafted and will soon be available.
+ """
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.lymphoma",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "lymphoma",
+ span_setter: SpanSetterArg = {"ents": True, "lymphoma": True},
+ ):
super().__init__(
nlp=nlp,
- name="lymphoma",
+ name=name,
patterns=patterns,
+ label=label,
+ span_setter=span_setter,
)
diff --git a/edsnlp/pipelines/ner/disorders/myocardial_infarction/factory.py b/edsnlp/pipelines/ner/disorders/myocardial_infarction/factory.py
index 2b59fb32b..528d86d36 100644
--- a/edsnlp/pipelines/ner/disorders/myocardial_infarction/factory.py
+++ b/edsnlp/pipelines/ner/disorders/myocardial_infarction/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .myocardial_infarction import MyocardialInfarction
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .myocardial_infarction import MyocardialInfarctionMatcher
+from .patterns import default_patterns
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="myocardial_infarction",
+ span_setter={"ents": True, "myocardial_infarction": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.myocardial_infarction",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return MyocardialInfarction(nlp, patterns=patterns)
+)(MyocardialInfarctionMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/myocardial_infarction/myocardial_infarction.py b/edsnlp/pipelines/ner/disorders/myocardial_infarction/myocardial_infarction.py
index c810f80cf..75d38be45 100644
--- a/edsnlp/pipelines/ner/disorders/myocardial_infarction/myocardial_infarction.py
+++ b/edsnlp/pipelines/ner/disorders/myocardial_infarction/myocardial_infarction.py
@@ -1,28 +1,109 @@
"""`eds.myocardial_infarction` pipeline"""
-from typing import Generator
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
from spacy.tokens import Doc, Span
+from edsnlp.pipelines.base import SpanSetterArg
from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
from .patterns import default_patterns
-class MyocardialInfarction(DisorderMatcher):
- def __init__(self, nlp, patterns):
+class MyocardialInfarctionMatcher(DisorderMatcher):
+ """
+ The `eds.myocardial_infarction` pipeline component extracts mentions of myocardial
+ infarction. It will notably match:
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+ - Mentions of various diseases (see below)
+ - Mentions of stents with a heart localization
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/myocardial_infarction/patterns.py"
+ # fmt: on
+ ```
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+ - `span._.assigned`: dictionary with the following keys, if relevant:
+ - `heart_localized`: localization of the stent or bypass
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe("eds.myocardial_infarction")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/myocardial-infarction-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.myocardial_infarction` component was developed by AP-HP's Data Science
+ team with a team of medical experts. A paper describing in detail the development
+ of those components is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.myocardial_infarction",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "myocardial_infarction",
+ span_setter: SpanSetterArg = {"ents": True, "myocardial_infarction": True},
+ ):
super().__init__(
nlp=nlp,
- name="myocardial_infarction",
+ name=name,
patterns=patterns,
+ label=label,
+ span_setter=span_setter,
)
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- for span in spans:
+ def process(self, doc: Doc) -> List[Span]:
+ for span in super().process(doc):
if (
span._.source == "with_localization"
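
The hunk stops mid-condition, but given the `heart_localized` key documented above, the filter presumably drops generic stent or bypass mentions whose captured localization is not cardiac. A hedged reconstruction (the exact condition is an assumption):

```python
from typing import Iterable

from spacy.tokens import Doc, Span

from edsnlp.pipelines.ner.disorders.myocardial_infarction.myocardial_infarction import (  # noqa: E501
    MyocardialInfarctionMatcher,
)


class Sketch(MyocardialInfarctionMatcher):
    def process(self, doc: Doc) -> Iterable[Span]:
        for span in super().process(doc):
            # Assumed tail of the truncated condition: a stent without a
            # cardiac localization should not count as myocardial infarction.
            if (
                span._.source == "with_localization"
                and "heart_localized" not in span._.assigned
            ):
                continue
            yield span
```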
diff --git a/edsnlp/pipelines/ner/disorders/peptic_ulcer_disease/factory.py b/edsnlp/pipelines/ner/disorders/peptic_ulcer_disease/factory.py
index 7f06cc4f7..50333cc5a 100644
--- a/edsnlp/pipelines/ner/disorders/peptic_ulcer_disease/factory.py
+++ b/edsnlp/pipelines/ner/disorders/peptic_ulcer_disease/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .peptic_ulcer_disease import PepticUlcerDisease
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .patterns import default_patterns
+from .peptic_ulcer_disease import PepticUlcerDiseaseMatcher
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="peptic_ulcer_disease",
+ span_setter={"ents": True, "peptic_ulcer_disease": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.peptic_ulcer_disease",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return PepticUlcerDisease(nlp, patterns=patterns)
+)(PepticUlcerDiseaseMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py b/edsnlp/pipelines/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py
index e131db5a5..c69fa4dbe 100644
--- a/edsnlp/pipelines/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py
+++ b/edsnlp/pipelines/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py
@@ -1,31 +1,105 @@
"""`eds.peptic_ulcer_disease` pipeline"""
-from typing import Generator
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
from spacy.tokens import Doc, Span
+from edsnlp.pipelines.base import SpanSetterArg
from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
from .patterns import default_patterns
-class PepticUlcerDisease(DisorderMatcher):
- def __init__(self, nlp, patterns):
+class PepticUlcerDiseaseMatcher(DisorderMatcher):
+ """
+ The `eds.peptic_ulcer_disease` pipeline component extracts mentions of peptic ulcer
+ disease.
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/peptic_ulcer_disease/patterns.py"
+ # fmt: on
+ ```
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe("eds.peptic_ulcer_disease")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/peptic-ulcer-disease-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline object
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The `eds.peptic_ulcer_disease` component was developed by AP-HP's Data Science team
+ with a team of medical experts. A paper describing in detail the development of
+ those components is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.peptic_ulcer_disease",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "peptic_ulcer_disease",
+ span_setter: SpanSetterArg = {"ents": True, "peptic_ulcer_disease": True},
+ ):
super().__init__(
nlp=nlp,
- name="peptic_ulcer_disease",
+ name=name,
patterns=patterns,
- include_assigned=False,
+ label=label,
+ span_setter=span_setter,
)
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- for span in spans:
-
+ def process(self, doc: Doc) -> List[Span]:
+ for span in super().process(doc):
if (span._.source == "generic") and not span._.assigned:
continue
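
A bare, unqualified ulcer mention maps to the `generic` source and is dropped when nothing was captured into `span._.assigned`. The effect can be checked like this (illustrative texts; matching depends on the shipped patterns):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.peptic_ulcer_disease")

for text in ("Ulcère.", "Ulcère gastro-duodénal."):
    doc = nlp(text)
    print(text, "->", [ent.text for ent in doc.ents])
```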
diff --git a/edsnlp/pipelines/ner/disorders/peripheral_vascular_disease/factory.py b/edsnlp/pipelines/ner/disorders/peripheral_vascular_disease/factory.py
index edb1e7753..28512bff7 100644
--- a/edsnlp/pipelines/ner/disorders/peripheral_vascular_disease/factory.py
+++ b/edsnlp/pipelines/ner/disorders/peripheral_vascular_disease/factory.py
@@ -1,20 +1,15 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .peripheral_vascular_disease import PeripheralVascularDisease
-
-DEFAULT_CONFIG = dict(patterns=None)
+from .patterns import default_patterns
+from .peripheral_vascular_disease import PeripheralVascularDiseaseMatcher
+DEFAULT_CONFIG = dict(
+ patterns=default_patterns,
+ label="peripheral_vascular_disease",
+ span_setter={"ents": True, "peripheral_vascular_disease": True},
+)
-@Language.factory(
+create_component = Language.factory(
"eds.peripheral_vascular_disease",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
-):
- return PeripheralVascularDisease(nlp, patterns=patterns)
+)(PeripheralVascularDiseaseMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py b/edsnlp/pipelines/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py
index 3eb2a0848..49292b46f 100644
--- a/edsnlp/pipelines/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py
+++ b/edsnlp/pipelines/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py
@@ -1,27 +1,108 @@
"""`eds.peripheral_vascular_disease` pipeline"""
-from typing import Generator
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
from spacy.tokens import Doc, Span
+from edsnlp.pipelines.base import SpanSetterArg
from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
from .patterns import default_patterns
-class PeripheralVascularDisease(DisorderMatcher):
- def __init__(self, nlp, patterns):
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+class PeripheralVascularDiseaseMatcher(DisorderMatcher):
+ """
+ The `eds.peripheral_vascular_disease` pipeline component extracts mentions of
+ peripheral vascular disease.
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/peripheral_vascular_disease/patterns.py"
+ # fmt: on
+ ```
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to `"PRESENT"`
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe("eds.peripheral_vascular_disease")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/peripheral-vascular-disease-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+
+ The `eds.peripheral_vascular_disease` component was developed by AP-HP's Data
+ Science team with a team of medical experts. A paper describing in detail the
+ development of those components is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.peripheral_vascular_disease",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ label: str = "peripheral_vascular_disease",
+ span_setter: SpanSetterArg = {
+ "ents": True,
+ "peripheral_vascular_disease": True,
+ },
+ ):
super().__init__(
nlp=nlp,
- name="peripheral_vascular_disease",
+ name=name,
patterns=patterns,
+ label=label,
+ span_setter=span_setter,
)
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- for span in spans:
+ def process(self, doc: Doc) -> List[Span]:
+ for span in super().process(doc):
if span._.source == "ischemia":
if "peripheral" not in span._.assigned.keys():
continue
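
Likewise, an `ischemia`-sourced match only survives when a `peripheral` localization was captured alongside it, so that cerebral or cardiac ischemia does not count as peripheral vascular disease. For instance (illustrative texts; matching depends on the shipped patterns):

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.peripheral_vascular_disease")

for text in ("Ischémie cérébrale.", "Ischémie du membre inférieur."):
    doc = nlp(text)
    print(text, "->", [ent.text for ent in doc.ents])
```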
diff --git a/edsnlp/pipelines/ner/disorders/solid_tumor/factory.py b/edsnlp/pipelines/ner/disorders/solid_tumor/factory.py
index 20472c3ac..e093d3c8f 100644
--- a/edsnlp/pipelines/ner/disorders/solid_tumor/factory.py
+++ b/edsnlp/pipelines/ner/disorders/solid_tumor/factory.py
@@ -1,24 +1,16 @@
-from typing import Any, Dict, Optional
+from spacy import Language
-from spacy.language import Language
-
-from .solid_tumor import SolidTumor
+from .patterns import default_patterns
+from .solid_tumor import SolidTumorMatcher
DEFAULT_CONFIG = dict(
- patterns=None,
+ patterns=default_patterns,
use_tnm=False,
+ label="solid_tumor",
+ span_setter={"ents": True, "solid_tumor": True},
)
-
-@Language.factory(
+create_component = Language.factory(
"eds.solid_tumor",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- patterns: Optional[Dict[str, Any]],
- use_tnm: bool,
-):
- return SolidTumor(nlp, patterns=patterns, use_tnm=use_tnm)
+)(SolidTumorMatcher)
diff --git a/edsnlp/pipelines/ner/disorders/solid_tumor/solid_tumor.py b/edsnlp/pipelines/ner/disorders/solid_tumor/solid_tumor.py
index 13cc6091c..8ab8407ea 100644
--- a/edsnlp/pipelines/ner/disorders/solid_tumor/solid_tumor.py
+++ b/edsnlp/pipelines/ner/disorders/solid_tumor/solid_tumor.py
@@ -1,35 +1,120 @@
"""`eds.solid_tumor` pipeline"""
-from typing import Generator
+from typing import Any, Dict, List, Optional, Union
+from spacy import Language
from spacy.tokens import Doc, Span
-from edsnlp.pipelines.ner.disorders.base import DisorderMatcher
-from edsnlp.pipelines.ner.scores.tnm import TNM
+from edsnlp.pipelines.base import SpanSetterArg
from edsnlp.utils.numbers import parse_digit
+from ..base import DisorderMatcher
from .patterns import default_patterns
-class SolidTumor(DisorderMatcher):
- def __init__(self, nlp, patterns, use_tnm):
- self.nlp = nlp
- if patterns is None:
- patterns = default_patterns
+class SolidTumorMatcher(DisorderMatcher):
+ """
+ The `eds.solid_tumor` pipeline component extracts mentions of solid tumors,
+ matching the patterns detailed below.
+ ??? info "Details of the used patterns"
+ ```{ .python .no-check }
+ # fmt: off
+ --8<-- "edsnlp/pipelines/ner/disorders/solid_tumor/patterns.py"
+ # fmt: on
+ ```
+
+ Extensions
+ ----------
+ On each span `span` that matches, the following attributes are available:
+
+ - `span._.detailed_status`: set to either
+ - `"METASTASIS"` for tumors at the metastatic stage
+ - `"LOCALIZED"` else
+ - `span._.assigned`: dictionary with the following keys, if relevant:
+ - `stage`: stage of the tumor
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe(
+ "eds.normalizer",
+ config=dict(
+ accents=True,
+ lowercase=True,
+ quotes=True,
+ spaces=True,
+ pollution=dict(
+ information=True,
+ bars=True,
+ biology=True,
+ doctors=True,
+ web=True,
+ coding=True,
+ footer=True,
+ ),
+ ),
+ )
+ nlp.add_pipe("eds.solid_tumor")
+ ```
+
+ Below are a few examples:
+
+ --8<-- "docs/assets/fragments/solid-tumor-examples.md"
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline
+ name : Optional[str]
+ The name of the component
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]]
+ The patterns to use for matching
+ label : str
+ The label to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+ use_tnm : bool
+ Whether to use TNM scores matching as well
+
+ Authors and citation
+ --------------------
+ The `eds.solid_tumor` component was developed by AP-HP's Data Science team with a
+ team of medical experts. A paper describing in detail the development of those
+ components is being drafted and will soon be available.
+ """
+
+ def __init__(
+ self,
+ nlp: Optional[Language] = None,
+ name: str = "eds.solid_tumor",
+ *,
+ patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns,
+ use_tnm: bool = False,
+ label: str = "solid_tumor",
+ span_setter: SpanSetterArg = {"ents": True, "solid_tumor": True},
+ ):
super().__init__(
nlp=nlp,
- name="solid_tumor",
+ name=name,
patterns=patterns,
- detailled_statusmapping={
+ detailed_status_mapping={
0: "ABSENT",
1: "LOCALIZED",
2: "METASTASIS",
},
+ label=label,
+ span_setter=span_setter,
)
self.use_tnm = use_tnm
if use_tnm:
+ from edsnlp.pipelines.ner.tnm import TNM
+
self.tnm = TNM(nlp, pattern=None, attr="TEXT")
def process_tnm(self, doc):
@@ -44,8 +129,8 @@ def process_tnm(self, doc):
span._.status = 2
yield span
- def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
- for span in spans:
+ def process(self, doc: Doc) -> List[Span]:
+ for span in super().process(doc):
if (span._.source == "metastasis") or (
"metastasis" in span._.assigned.keys()
):
@@ -54,7 +139,7 @@ def postprocess(self, doc: Doc, spans: Generator[Span, None, None]):
if "stage" in span._.assigned.keys():
stage = parse_digit(
span._.assigned["stage"],
- atttr="NORM",
+ attr="NORM",
ignore_excluded=True,
)
if stage == 4:
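
Besides the `atttr` to `attr` typo fix in the `parse_digit` call, the TNM import is now deferred so it only runs when `use_tnm=True`. Enabling it also brings TNM score matching into the component:

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
# use_tnm triggers the deferred TNM import shown above.
nlp.add_pipe("eds.solid_tumor", config=dict(use_tnm=True))

# Illustrative text; a parsed stage of 4 maps to "METASTASIS".
doc = nlp("Tumeur du sein, stade IV.")
for ent in doc.ents:
    print(ent.text, ent._.detailed_status)
```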
diff --git a/edsnlp/pipelines/ner/drugs/factory.py b/edsnlp/pipelines/ner/drugs/factory.py
index f55656400..21e66942d 100644
--- a/edsnlp/pipelines/ner/drugs/factory.py
+++ b/edsnlp/pipelines/ner/drugs/factory.py
@@ -1,69 +1,122 @@
from typing import Any, Dict
from spacy.language import Language
+from typing_extensions import Literal
-from edsnlp.pipelines.core.terminology import TerminologyMatcher, TerminologyTermMatcher
+from edsnlp.pipelines.base import SpanSetterArg
+from edsnlp.pipelines.core.terminology.terminology import TerminologyMatcher
-from . import patterns
+from .patterns import get_patterns
DEFAULT_CONFIG = dict(
attr="NORM",
ignore_excluded=False,
ignore_space_tokens=False,
- term_matcher=TerminologyTermMatcher.exact,
+ term_matcher="exact",
term_matcher_config={},
+ label="drug",
+ span_setter={"ents": True, "drug": True},
)
@Language.factory(
"eds.drugs",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
)
def create_component(
nlp: Language,
name: str = "eds.drugs",
+ *,
attr: str = "NORM",
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
- term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
+ term_matcher: Literal["exact", "simstring"] = "exact",
term_matcher_config: Dict[str, Any] = {},
+ label: str = "drug",
+ span_setter: SpanSetterArg = {"ents": True, "drug": True},
):
"""
- Create a new component to recognize and normalize drugs in documents.
- The terminology is based on Romedi (see documentation) and the
- drugs are normalized to the ATC codes.
+ The `eds.drugs` pipeline component detects mentions of French drugs (brand names and
+ active ingredients) and adds them to `doc.ents`. Each drug is mapped to an
+ [ATC](https://enwp.org/?curid=2770) code through the Romedi terminology
+ ([@cossin:hal-02987843]). The ATC classifies drugs into groups.
+
+ Examples
+ --------
+ In this example, we are looking for an oral antidiabetic medication
+ (ATC code: A10B).
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.normalizer")
+ nlp.add_pipe("eds.drugs", config=dict(term_matcher="exact"))
+
+ text = "Traitement habituel: Kardégic, cardensiel (bisoprolol), glucophage, lasilix"
+
+ doc = nlp(text)
+
+ drugs_detected = [(x.text, x.kb_id_) for x in doc.ents]
+
+ drugs_detected[0]
+ # Out: ('Kardégic', 'B01AC06')
+
+ len(drugs_detected)
+ # Out: 5
+
+ oral_antidiabetics_detected = list(
+ filter(lambda x: (x[1].startswith("A10B")), drugs_detected)
+ )
+ oral_antidiabetics_detected
+ # Out: [('glucophage', 'A10BA02')]
+ ```
+
+ Glucophage is a brand-name medication containing metformin, the first-line
+ treatment for type 2 diabetes.
Parameters
----------
- nlp: Language
- spaCy `Language` object.
- name: str
- The name of the pipe
- attr: Union[str, Dict[str, str]]
- Attribute to match on, eg `TEXT`, `NORM`, etc.
- ignore_excluded: bool
- Whether to skip excluded tokens during matching.
- ignore_space_tokens: bool
+ nlp : Language
+ The pipeline object
+ name : str
+ The name of the component
+ attr : str
+ The default attribute to use for matching.
+ ignore_excluded : bool
+ Whether to skip excluded tokens (requires an upstream
+ pipeline to mark excluded tokens).
+ ignore_space_tokens : bool
Whether to skip space tokens during matching.
- term_matcher: TerminologyTermMatcher
- The term matcher to use, either `TerminologyTermMatcher.exact` or
- `TerminologyTermMatcher.simstring`
- term_matcher_config: Dict[str, Any]
- The configuration for the term matcher
+ term_matcher : Literal["exact", "simstring"]
+ The matcher to use for matching phrases: either "exact" or "simstring"
+ term_matcher_config : Dict[str, Any]
+ Parameters passed to the term matcher
+ label : str
+ Label name to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
Returns
-------
TerminologyMatcher
+
+ Authors and citation
+ --------------------
+ The `eds.drugs` pipeline was developed by the IAM team and CHU de Bordeaux's Data
+ Science team.
"""
return TerminologyMatcher(
- nlp,
- label="drug",
- terms=patterns.get_patterns(),
+ nlp=nlp,
+ name=name,
regex=dict(),
+ terms=get_patterns(),
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
term_matcher=term_matcher,
term_matcher_config=term_matcher_config,
+ label=label,
+ span_setter=span_setter,
)
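
The new `label` and `span_setter` parameters follow the same convention as the disorder matchers: by default, matches land both in `doc.ents` and in the `doc.spans["drug"]` group. Assuming `SpanSetterArg` accepts a plain dict, as the defaults above suggest, matches can also be collected into a custom span group:

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe(
    "eds.drugs",
    config=dict(
        # Also collect matches in doc.spans["medication"] (assumed to work
        # like the other span_setter dicts in this changeset).
        span_setter={"ents": True, "medication": True},
    ),
)

doc = nlp("Traitement habituel: glucophage.")
print(doc.spans["medication"])
```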
diff --git a/edsnlp/pipelines/ner/scores/__init__.py b/edsnlp/pipelines/ner/scores/__init__.py
index f492701da..4619c3e5f 100644
--- a/edsnlp/pipelines/ner/scores/__init__.py
+++ b/edsnlp/pipelines/ner/scores/__init__.py
@@ -1,7 +1,3 @@
-from edsnlp.pipelines.ner.scores.base_score import Score
+from edsnlp.pipelines.ner.scores.base_score import SimpleScoreMatcher
-from . import factory
-from .charlson import factory as charlson_factory
-from .elstonellis import factory as elstonellis_factory
-from .emergency.ccmu import factory as emergecy_ccmu_factory
-from .sofa import factory as sofa_factory
+Score = SimpleScoreMatcher
diff --git a/edsnlp/pipelines/ner/scores/base_score.py b/edsnlp/pipelines/ner/scores/base_score.py
index 451dfc6c9..f8fb64d52 100644
--- a/edsnlp/pipelines/ner/scores/base_score.py
+++ b/edsnlp/pipelines/ner/scores/base_score.py
@@ -1,59 +1,84 @@
import re
-from typing import Any, Callable, Dict, List, Union
+import warnings
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
from spacy import registry
from spacy.language import Language
from spacy.tokens import Doc, Span
+from edsnlp.pipelines.base import SpanSetterArg
from edsnlp.pipelines.core.contextual_matcher import ContextualMatcher
-from edsnlp.utils.filter import filter_spans
-class Score(ContextualMatcher):
- """Matcher component to extract a numeric score"""
+class SimpleScoreMatcher(ContextualMatcher):
+ """
+ Matcher component to extract a numeric score
+
+ Parameters
+ ----------
+ nlp : Language
+ The pipeline object
+ name : str
+ The name of the component
+ regex : List[str]
+ A list of regexes to identify the score
+ attr : str
+ Whether to match on the text ('TEXT') or on the normalized text ('NORM')
+ value_extract : str
+ Regex with capturing group to get the score value
+ score_normalization : Union[str, Callable[[Union[str,None]], Any]]
+ Function that takes the "raw" value extracted from the `value_extract`
+ regex and should return:
+
+ - None if no score could be extracted
+ - The desired score value otherwise
+ window : int
+ Number of tokens to include after the score's mention to find the
+ score's value
+ ignore_excluded : bool
+ Whether to ignore excluded spans when matching
+ ignore_space_tokens : bool
+ Whether to ignore space tokens when matching
+ flags : Union[re.RegexFlag, int]
+ Regex flags to use when matching
+ score_name: str
+ Deprecated, use `label` instead. The name of the extracted score
+ label : str
+ Label name to use for the `Span` object and the extension
+ span_setter: Optional[SpanSetterArg]
+ How to set matches on the doc
+ """
def __init__(
self,
nlp: Language,
- score_name: str,
- regex: List[str],
- attr: str,
- value_extract: Union[str, Dict[str, str], List[Dict[str, str]]],
- score_normalization: Union[str, Callable[[Union[str, None]], Any]],
- window: int,
- ignore_excluded: bool,
- ignore_space_tokens: bool,
- flags: Union[re.RegexFlag, int],
+ name: str,
+ *,
+ regex: List[str] = None,
+ attr: str = "NORM",
+ value_extract: Union[str, Dict[str, str], List[Dict[str, str]]] = None,
+ score_normalization: Union[str, Callable[[Union[str, None]], Any]] = None,
+ window: int = 7,
+ ignore_excluded: bool = False,
+ ignore_space_tokens: bool = False,
+ flags: Union[re.RegexFlag, int] = 0,
+ score_name: str = None,
+ label: str = None,
+ span_setter: Optional[SpanSetterArg] = None,
):
- """
- Parameters
- ----------
- nlp : Language
- The spaCy object.
- score_name : str
- The name of the extracted score
- regex : List[str]
- A list of regexes to identify the score
- attr : str
- Whether to match on the text ('TEXT') or on the normalized text ('NORM')
- value_extract : str
- Regex with capturing group to get the score value
- score_normalization : Callable[[Union[str,None]], Any]
- Function that takes the "raw" value extracted from the `value_extract`
- regex and should return:
-
- - None if no score could be extracted
- - The desired score value else
- window : int
- Number of token to include after the score's mention to find the
- score's value
- ignore_excluded : bool
- Whether to ignore excluded spans when matching
- ignore_space_tokens : bool
- Whether to ignore space tokens when matching
- flags : Union[re.RegexFlag, int]
- Regex flags to use when matching
- """
+ if score_name is not None:
+ warnings.warn(
+ "`score_name` is deprecated, use `label` instead.",
+ DeprecationWarning,
+ )
+ label = score_name
+
+ if label is None:
+ raise ValueError("`label` parameter is required.")
+
+ if span_setter is None:
+ span_setter = {"ents": True, label: True}
+
if isinstance(value_extract, str):
value_extract = dict(
name="value",
@@ -76,14 +101,14 @@ def __init__(
assert value_exists, "You should provide a `value` regex in the `assign` dict."
patterns = dict(
- source=score_name,
+ source=label,
regex=regex,
assign=value_extract,
)
super().__init__(
nlp=nlp,
- name=score_name,
+ name=name,
patterns=patterns,
assign_as_span=False,
alignment_mode="expand",
@@ -92,78 +117,46 @@ def __init__(
attr=attr,
regex_flags=flags,
include_assigned=False,
+ label=label,
+ span_setter=span_setter,
)
- self.score_name = score_name
-
if isinstance(score_normalization, str):
self.score_normalization = registry.get("misc", score_normalization)
else:
self.score_normalization = score_normalization
- self.set_extensions()
-
- @classmethod
- def set_extensions(cls) -> None:
- super(Score, Score).set_extensions()
+ def set_extensions(self) -> None:
+ super().set_extensions()
+ if not Span.has_extension(self.label):
+ Span.set_extension(self.label, default=None)
if not Span.has_extension("score_name"):
Span.set_extension("score_name", default=None)
if not Span.has_extension("score_value"):
- Span.set_extension("score_value", default=None)
-
- def __call__(self, doc: Doc) -> Doc:
- """
- Adds spans to document.
+ Span.set_extension("score_value", getter=lambda x: x._.value)
- Parameters
- ----------
- doc:
- spaCy Doc object
-
- Returns
- -------
- doc:
- spaCy Doc object, annotated for extracted terms.
- """
-
- ents = self.process(doc)
- ents = self.score_filtering(ents)
-
- ents, discarded = filter_spans(
- list(doc.ents) + list(ents), return_discarded=True
- )
-
- doc.ents = ents
-
- if "discarded" not in doc.spans:
- doc.spans["discarded"] = []
- doc.spans["discarded"].extend(discarded)
-
- return doc
-
- def score_filtering(self, ents: List[Span]) -> List[Span]:
+ def process(self, doc: Doc) -> Iterable[Span]:
"""
Extracts, if available, the value of the score.
Normalizes the score via the provided `self.score_normalization` method.
Parameters
----------
- ents: List[Span]
- List of spaCy's spans extracted by the score matcher
+ doc: Doc
+ Document to process
- Returns
- -------
- ents: List[Span]
- List of spaCy's spans, with, if found, an added `score_value` extension
+ Yields
+ ------
+ Span
+ Matches with, if found, an added `score_value` extension
"""
-
- for ent in ents:
+ for ent in super().process(doc):
value = ent._.assigned.get("value", None)
if value is None:
continue
normalized_value = self.score_normalization(value)
if normalized_value is not None:
- ent._.score_name = self.score_name
- ent._.score_value = normalized_value
+ ent._.score_name = self.label
+ ent._.set(self.label, normalized_value)
yield ent
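Since `score_normalization` may be given as a string and resolved through spaCy's `misc` registry (see `registry.get("misc", ...)` above), a custom normalizer can be registered once and then referenced by name in a config. A minimal sketch; the registry name `score_normalization.custom` and the value bounds are illustrative assumptions, not part of the library:

```python
import spacy


@spacy.registry.misc("score_normalization.custom")
def normalize_score(raw_value):
    # Per the contract described above: return None when no score can be
    # extracted, and the normalized value otherwise.
    try:
        value = int(raw_value)
    except (TypeError, ValueError):
        return None
    # Illustrative plausibility bounds (assumption)
    return value if 0 <= value <= 20 else None
```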
diff --git a/edsnlp/pipelines/ner/scores/charlson/factory.py b/edsnlp/pipelines/ner/scores/charlson/factory.py
index 5ab820eec..05cb4c536 100644
--- a/edsnlp/pipelines/ner/scores/charlson/factory.py
+++ b/edsnlp/pipelines/ner/scores/charlson/factory.py
@@ -1,9 +1,10 @@
import re
-from typing import Any, Callable, List, Union
+from typing import Any, Callable, List, Optional, Union
from spacy.language import Language
-from edsnlp.pipelines.ner.scores import Score
+from edsnlp.pipelines.base import SpanSetterArg
+from edsnlp.pipelines.ner.scores.base_score import SimpleScoreMatcher
from edsnlp.pipelines.ner.scores.charlson import patterns
from edsnlp.utils.deprecation import deprecated_factory
@@ -16,35 +17,118 @@
ignore_excluded=False,
ignore_space_tokens=False,
flags=0,
+ label="charlson",
+ span_setter={"ents": True, "charlson": True},
)
@deprecated_factory(
"charlson",
"eds.charlson",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
)
@Language.factory(
"eds.charlson",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
)
def create_component(
nlp: Language,
- name: str,
- regex: List[str],
- value_extract: str,
- score_normalization: Union[str, Callable[[Union[str, None]], Any]],
- attr: str,
- window: int,
- ignore_excluded: bool,
- ignore_space_tokens: bool,
- flags: Union[re.RegexFlag, int],
+ name: Optional[str] = None,
+ *,
+ regex: List[str] = patterns.regex,
+ value_extract: str = patterns.value_extract,
+ score_normalization: Union[
+ str, Callable[[Union[str, None]], Any]
+ ] = patterns.score_normalization_str,
+ attr: str = "NORM",
+ window: int = 7,
+ ignore_excluded: bool = False,
+ ignore_space_tokens: bool = False,
+ flags: Union[re.RegexFlag, int] = 0,
+ label: str = "charlson",
+ span_setter: SpanSetterArg = {"ents": True, "charlson": True},
):
- return Score(
+ '''
+ The `eds.charlson` component extracts the
+ [Charlson Comorbidity Index](https://www.mdcalc.com/charlson-comorbidity-index-cci).
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe("eds.normalizer")
+ nlp.add_pipe("eds.charlson")
+
+ text = """
+ Charlson à l'admission: 7.
+ Charlson:
+ OMS:
+ """
+
+ doc = nlp(text)
+ doc.ents
+ # Out: (Charlson à l'admission: 7,)
+ ```
+
+ We can see that only one occurrence was extracted. The second mention of
+ Charlson in the text doesn't contain any numerical value, so it isn't extracted.
+
+ Extensions
+ ----------
+ Each extraction exposes 2 extensions:
+
+ ```python
+ ent = doc.ents[0]
+
+ ent._.score_name
+ # Out: 'charlson'
+
+ ent._.score_value
+ # Out: 7
+ ```
+
+ Parameters
+ ----------
+ nlp : Language
+ The pipeline object
+ name : Optional[str]
+ Name of the component
+ regex : List[str]
+ A list of regexes to identify the score
+ attr : str
+ Whether to match on the text ('TEXT') or on the normalized text ('NORM')
+ value_extract : str
+ Regex with capturing group to get the score value
+ score_normalization : Union[str, Callable[[Union[str,None]], Any]]
+ Function that takes the "raw" value extracted from the `value_extract`
+ regex and should return:
+
+ - None if no score could be extracted
+ - The desired score value otherwise
+ window : int
+ Number of tokens to include after the score's mention to find the
+ score's value
+ ignore_excluded : bool
+ Whether to ignore excluded spans when matching
+ ignore_space_tokens : bool
+ Whether to ignore space tokens when matching
+ flags : Union[re.RegexFlag, int]
+ Regex flags to use when matching
+ label : str
+ Label name to use for the `Span` object and the extension
+ span_setter: SpanSetterArg
+ How to set matches on the doc
+
+ Returns
+ -------
+ SimpleScoreMatcher
+ '''
+ return SimpleScoreMatcher(
nlp,
- score_name=name,
+ name=name,
regex=regex,
value_extract=value_extract,
score_normalization=score_normalization,
@@ -53,4 +137,6 @@ def create_component(
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
flags=flags,
+ label=label,
+ span_setter=span_setter,
)
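A note on the `span_setter` argument used by all these factories: with the default shown above, each match is expected to be written both to `doc.ents` and to a span group named after the label. A short sketch of that assumed behaviour:

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.sentences")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.charlson")

doc = nlp("Charlson à l'admission: 7.")

# With span_setter={"ents": True, "charlson": True} (the default), the match
# should appear in both destinations (assumed semantics of SpanSetterArg).
print(doc.ents)
print(doc.spans["charlson"])
```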
diff --git a/edsnlp/pipelines/ner/scores/elston_ellis/__init__.py b/edsnlp/pipelines/ner/scores/elston_ellis/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/edsnlp/pipelines/ner/scores/elston_ellis/factory.py b/edsnlp/pipelines/ner/scores/elston_ellis/factory.py
new file mode 100644
index 000000000..a6b54a716
--- /dev/null
+++ b/edsnlp/pipelines/ner/scores/elston_ellis/factory.py
@@ -0,0 +1,118 @@
+import re
+from typing import Any, Callable, List, Optional, Union
+
+from spacy.language import Language
+
+from edsnlp.pipelines.base import SpanSetterArg
+from edsnlp.pipelines.ner.scores.base_score import SimpleScoreMatcher
+from edsnlp.utils.deprecation import deprecated_factory
+
+from .patterns import regex, score_normalization_str, value_extract
+
+DEFAULT_CONFIG = dict(
+ regex=regex,
+ value_extract=value_extract,
+ score_normalization=score_normalization_str,
+ attr="TEXT",
+ window=20,
+ ignore_excluded=False,
+ ignore_space_tokens=False,
+ flags=0,
+ label="elston_ellis",
+ span_setter={"ents": True, "elston_ellis": True},
+)
+
+
+@deprecated_factory(
+ "eds.elston-ellis",
+ "eds.elston_ellis",
+ assigns=["doc.ents", "doc.spans"],
+)
+@deprecated_factory(
+ "eds.elstonellis",
+ "eds.elston_ellis",
+ assigns=["doc.ents", "doc.spans"],
+)
+@Language.factory(
+ "eds.elston_ellis",
+ assigns=["doc.ents", "doc.spans"],
+)
+def create_component(
+ nlp: Language,
+ name: Optional[str] = None,
+ *,
+ regex: List[str] = regex,
+ value_extract: str = value_extract,
+ score_normalization: Union[
+ str, Callable[[Union[str, None]], Any]
+ ] = score_normalization_str,
+ attr: str = "TEXT",
+ window: int = 20,
+ ignore_excluded: bool = False,
+ ignore_space_tokens: bool = False,
+ flags: Union[re.RegexFlag, int] = 0,
+ label: str = "elston_ellis",
+ span_setter: SpanSetterArg = {"ents": True, "elston_ellis": True},
+):
+ """
+ Matcher for the Elston-Ellis score.
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.elston_ellis")
+ ```
+
+ Parameters
+ ----------
+ nlp : Language
+ The pipeline object
+ name : Optional[str]
+ The name of the component
+ regex : List[str]
+ A list of regexes to identify the score
+ attr : str
+ Whether to match on the text ('TEXT') or on the normalized text ('NORM')
+ value_extract : str
+ Regex with capturing group to get the score value
+ score_normalization : Union[str, Callable[[Union[str,None]], Any]]
+ Function that takes the "raw" value extracted from the `value_extract`
+ regex and should return:
+
+ - None if no score could be extracted
+ - The desired score value otherwise
+ window : int
+ Number of tokens to include after the score's mention to find the
+ score's value
+ ignore_excluded : bool
+ Whether to ignore excluded spans when matching
+ ignore_space_tokens : bool
+ Whether to ignore space tokens when matching
+ flags : Union[re.RegexFlag, int]
+ Regex flags to use when matching
+ label : str
+ Label name to use for the `Span` object and the extension
+ span_setter: SpanSetterArg
+ How to set matches on the doc
+
+ Returns
+ -------
+ SimpleScoreMatcher
+ """
+ return SimpleScoreMatcher(
+ nlp,
+ name=name,
+ regex=regex,
+ value_extract=value_extract,
+ score_normalization=score_normalization,
+ attr=attr,
+ window=window,
+ ignore_excluded=ignore_excluded,
+ ignore_space_tokens=ignore_space_tokens,
+ flags=flags,
+ label=label,
+ span_setter=span_setter,
+ )
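Both historical spellings of this factory are kept as deprecated aliases of the canonical `eds.elston_ellis` name, so existing configurations keep resolving. A sketch; the aliases are assumed to emit a deprecation message via `deprecated_factory`:

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe("eds.elston_ellis")  # canonical name

# Deprecated aliases, kept for backward compatibility (assumed to warn):
# nlp.add_pipe("eds.elston-ellis")
# nlp.add_pipe("eds.elstonellis")
```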
diff --git a/edsnlp/pipelines/ner/scores/elstonellis/patterns.py b/edsnlp/pipelines/ner/scores/elston_ellis/patterns.py
similarity index 100%
rename from edsnlp/pipelines/ner/scores/elstonellis/patterns.py
rename to edsnlp/pipelines/ner/scores/elston_ellis/patterns.py
diff --git a/edsnlp/pipelines/ner/scores/elstonellis/factory.py b/edsnlp/pipelines/ner/scores/elstonellis/factory.py
deleted file mode 100644
index 8bdac8076..000000000
--- a/edsnlp/pipelines/ner/scores/elstonellis/factory.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import re
-from typing import Any, Callable, List, Union
-
-from spacy.language import Language
-
-from edsnlp.pipelines.ner.scores import Score
-from edsnlp.pipelines.ner.scores.elstonellis import patterns
-
-DEFAULT_CONFIG = dict(
- regex=patterns.regex,
- value_extract=patterns.value_extract,
- score_normalization=patterns.score_normalization_str,
- attr="TEXT",
- window=20,
- ignore_excluded=False,
- ignore_space_tokens=False,
- flags=0,
-)
-
-
-@Language.factory(
- "eds.elston-ellis",
- default_config=DEFAULT_CONFIG,
- assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- regex: List[str] = patterns.regex,
- value_extract: str = patterns.value_extract,
- score_normalization: Union[
- str, Callable[[Union[str, None]], Any]
- ] = patterns.score_normalization_str,
- attr: str = "TEXT",
- window: int = 20,
- ignore_excluded: bool = False,
- ignore_space_tokens: bool = False,
- flags: Union[re.RegexFlag, int] = 0,
-):
- """
- Matcher for the Elston-Ellis score.
-
- Parameters
- ----------
- nlp: Language
- The spaCy Language object
- name: str
- The name of the component
- regex: List[str]
- The regex patterns to match
- value_extract: str
- The regex pattern to extract the value from the matched text
- score_normalization: Union[str, Callable[[Union[str, None]], Any]]
- The normalization function to apply to the extracted value
- attr: str
- The token attribute to match on (e.g. "TEXT" or "NORM")
- window: int
- The window size to search for the regex pattern
- ignore_excluded: bool
- Whether to ignore excluded tokens
- ignore_space_tokens: bool
- Whether to ignore space tokens
- flags: Union[re.RegexFlag, int]
- The regex flags to use
-
- Returns
- -------
- Score
- """
- return Score(
- nlp,
- score_name=name,
- regex=regex,
- value_extract=value_extract,
- score_normalization=score_normalization,
- attr=attr,
- window=window,
- ignore_excluded=ignore_excluded,
- ignore_space_tokens=ignore_space_tokens,
- flags=flags,
- )
diff --git a/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py b/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py
index 1d91ae4a0..3fedee460 100644
--- a/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py
+++ b/edsnlp/pipelines/ner/scores/emergency/ccmu/factory.py
@@ -1,9 +1,10 @@
import re
-from typing import Any, Callable, List, Union
+from typing import Any, Callable, List, Optional, Union
from spacy.language import Language
-from edsnlp.pipelines.ner.scores import Score
+from edsnlp.pipelines.base import SpanSetterArg
+from edsnlp.pipelines.ner.scores.base_score import SimpleScoreMatcher
from edsnlp.pipelines.ner.scores.emergency.ccmu import patterns
from edsnlp.utils.deprecation import deprecated_factory
@@ -14,24 +15,31 @@
attr="NORM",
window=20,
ignore_excluded=False,
+ ignore_space_tokens=False,
flags=0,
+ label="emergency_ccmu",
+ span_setter={"ents": True, "emergency_ccmu": True},
)
@deprecated_factory(
"emergency.ccmu",
+ "eds.emergency_ccmu",
+ assigns=["doc.ents", "doc.spans"],
+)
+@deprecated_factory(
"eds.emergency.ccmu",
- default_config=DEFAULT_CONFIG,
+ "eds.emergency_ccmu",
assigns=["doc.ents", "doc.spans"],
)
@Language.factory(
- "eds.emergency.ccmu",
- default_config=DEFAULT_CONFIG,
+ "eds.emergency_ccmu",
assigns=["doc.ents", "doc.spans"],
)
def create_component(
nlp: Language,
- name: str = "eds.emergency.ccmu",
+ name: Optional[str] = None,
+ *,
regex: List[str] = patterns.regex,
value_extract: str = patterns.value_extract,
score_normalization: Union[
@@ -42,40 +50,62 @@ def create_component(
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
flags: Union[re.RegexFlag, int] = 0,
+ label: str = "emergency_ccmu",
+ span_setter: SpanSetterArg = {"ents": True, "emergency_ccmu": True},
):
"""
- Matcher for the Emergency CCMU score.
+ Matcher for explicit mentions of the French
+ [CCMU emergency score](http://medicalcul.free.fr/ccmu.html).
+
+ Examples
+ --------
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.emergency_ccmu")
+ ```
Parameters
----------
- nlp: Language
- The spaCy Language object
- name: str
+ nlp : Language
+ The pipeline object
+ name : Optional[str]
The name of the component
- regex: List[str]
- The regex patterns to match
- value_extract: str
- The regex pattern to extract the value from the matched text
- score_normalization: Union[str, Callable[[Union[str, None]], Any]]
- The normalization function to apply to the extracted value
- attr: str
- The token attribute to match on (e.g. "TEXT" or "NORM")
- window: int
- The window size to search for the regex pattern
- ignore_excluded: bool
- Whether to ignore excluded tokens
- ignore_space_tokens: bool
- Whether to ignore space tokens
- flags: Union[re.RegexFlag, int]
- The regex flags to use
+ regex : List[str]
+ A list of regexes to identify the score
+ attr : str
+ Whether to match on the text ('TEXT') or on the normalized text ('NORM')
+ value_extract : str
+ Regex with capturing group to get the score value
+ score_normalization : Union[str, Callable[[Union[str,None]], Any]]
+ Function that takes the "raw" value extracted from the `value_extract`
+ regex and should return:
+
+ - None if no score could be extracted
+ - The desired score value otherwise
+ window : int
+ Number of tokens to include after the score's mention to find the
+ score's value
+ ignore_excluded : bool
+ Whether to ignore excluded spans when matching
+ ignore_space_tokens : bool
+ Whether to ignore space tokens when matching
+ flags : Union[re.RegexFlag, int]
+ Regex flags to use when matching
+ label : str
+ Label name to use for the `Span` object and the extension
+ span_setter: SpanSetterArg
+ How to set matches on the doc
Returns
-------
- Score
+ SimpleScoreMatcher
"""
- return Score(
+ return SimpleScoreMatcher(
nlp,
- score_name=name,
+ name=name,
regex=regex,
value_extract=value_extract,
score_normalization=score_normalization,
@@ -84,4 +114,6 @@ def create_component(
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
flags=flags,
+ label=label,
+ span_setter=span_setter,
)
diff --git a/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py b/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py
index 6647e50b9..f1c4c4c2a 100644
--- a/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py
+++ b/edsnlp/pipelines/ner/scores/emergency/gemsa/factory.py
@@ -1,9 +1,10 @@
import re
-from typing import Any, Callable, List, Union
+from typing import Any, Callable, List, Optional, Union
from spacy.language import Language
-from edsnlp.pipelines.ner.scores import Score
+from edsnlp.pipelines.base import SpanSetterArg
+from edsnlp.pipelines.ner.scores.base_score import SimpleScoreMatcher
from edsnlp.pipelines.ner.scores.emergency.gemsa import patterns
from edsnlp.utils.deprecation import deprecated_factory
@@ -14,24 +15,31 @@
attr="NORM",
window=20,
ignore_excluded=False,
+ ignore_space_tokens=False,
flags=0,
+ label="emergency_gemsa",
+ span_setter={"ents": True, "emergency_gemsa": True},
)
@deprecated_factory(
"emergency.gemsa",
+ "eds.emergency_gemsa",
+ assigns=["doc.ents", "doc.spans"],
+)
+@deprecated_factory(
"eds.emergency.gemsa",
- default_config=DEFAULT_CONFIG,
+ "eds.emergency_gemsa",
assigns=["doc.ents", "doc.spans"],
)
@Language.factory(
- "eds.emergency.gemsa",
- default_config=DEFAULT_CONFIG,
+ "eds.emergency_gemsa",
assigns=["doc.ents", "doc.spans"],
)
def create_component(
nlp: Language,
- name: str = "eds.emergency.gemsa",
+ name: Optional[str] = None,
+ *,
regex: List[str] = patterns.regex,
value_extract: str = patterns.value_extract,
score_normalization: Union[
@@ -42,40 +50,62 @@ def create_component(
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
flags: Union[re.RegexFlag, int] = 0,
+ label: str = "emergency_gemsa",
+ span_setter: SpanSetterArg = {"ents": True, "emergency_gemsa": True},
):
"""
- Matcher for the Emergency CCMU score.
+ Matcher for explicit mentions of the French
+ [GEMSA emergency score](http://medicalcul.free.fr/gemsa.html).
+
+ Examples
+ --------
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.emergency_gemsa")
+ ```
Parameters
----------
- nlp: Language
- The spaCy Language object
- name: str
+ nlp : Language
+ The pipeline object
+ name : Optional[str]
The name of the component
- regex: List[str]
- The regex patterns to match
- value_extract: str
- The regex pattern to extract the value from the matched text
- score_normalization: Union[str, Callable[[Union[str, None]], Any]]
- The normalization function to apply to the extracted value
- attr: str
- The token attribute to match on (e.g. "TEXT" or "NORM")
- window: int
- The window size to search for the regex pattern
- ignore_excluded: bool
- Whether to ignore excluded tokens
- ignore_space_tokens: bool
- Whether to ignore space tokens
- flags: Union[re.RegexFlag, int]
- The regex flags to use
+ regex : List[str]
+ A list of regexes to identify the score
+ attr : str
+ Whether to match on the text ('TEXT') or on the normalized text ('NORM')
+ value_extract : str
+ Regex with capturing group to get the score value
+ score_normalization : Union[str, Callable[[Union[str,None]], Any]]
+ Function that takes the "raw" value extracted from the `value_extract`
+ regex and should return:
+
+ - None if no score could be extracted
+ - The desired score value otherwise
+ window : int
+ Number of tokens to include after the score's mention to find the
+ score's value
+ ignore_excluded : bool
+ Whether to ignore excluded spans when matching
+ ignore_space_tokens : bool
+ Whether to ignore space tokens when matching
+ flags : Union[re.RegexFlag, int]
+ Regex flags to use when matching
+ label : str
+ Label name to use for the `Span` object and the extension
+ span_setter: SpanSetterArg
+ How to set matches on the doc
Returns
-------
- Score
+ SimpleScoreMatcher
"""
- return Score(
+ return SimpleScoreMatcher(
nlp,
- score_name=name,
+ name=name,
regex=regex,
value_extract=value_extract,
score_normalization=score_normalization,
@@ -84,4 +114,6 @@ def create_component(
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
flags=flags,
+ label=label,
+ span_setter=span_setter,
)
diff --git a/edsnlp/pipelines/ner/scores/emergency/priority/factory.py b/edsnlp/pipelines/ner/scores/emergency/priority/factory.py
index fd7235ebc..671d01aa5 100644
--- a/edsnlp/pipelines/ner/scores/emergency/priority/factory.py
+++ b/edsnlp/pipelines/ner/scores/emergency/priority/factory.py
@@ -1,81 +1,111 @@
import re
-from typing import Any, Callable, List, Union
+from typing import Any, Callable, List, Optional, Union
from spacy.language import Language
-from edsnlp.pipelines.ner.scores import Score
-from edsnlp.pipelines.ner.scores.emergency.priority import patterns
+from edsnlp.pipelines.base import SpanSetterArg
+from edsnlp.pipelines.ner.scores.base_score import SimpleScoreMatcher
from edsnlp.utils.deprecation import deprecated_factory
+from .patterns import regex, score_normalization_str, value_extract
+
DEFAULT_CONFIG = dict(
- regex=patterns.regex,
- value_extract=patterns.value_extract,
- score_normalization=patterns.score_normalization_str,
+ regex=regex,
+ value_extract=value_extract,
+ score_normalization=score_normalization_str,
attr="NORM",
window=7,
ignore_excluded=False,
+ ignore_space_tokens=False,
flags=0,
+ label="emergency_priority",
+ span_setter={"ents": True, "emergency_priority": True},
)
@deprecated_factory(
"emergency.priority",
+ "eds.emergency_priority",
+ assigns=["doc.ents", "doc.spans"],
+)
+@deprecated_factory(
"eds.emergency.priority",
- default_config=DEFAULT_CONFIG,
+ "eds.emergency_priority",
assigns=["doc.ents", "doc.spans"],
)
@Language.factory(
- "eds.emergency.priority",
- default_config=DEFAULT_CONFIG,
+ "eds.emergency_priority",
assigns=["doc.ents", "doc.spans"],
)
def create_component(
nlp: Language,
- name: str = "emergency.priority",
- regex: List[str] = patterns.regex,
- value_extract: str = patterns.value_extract,
+ name: Optional[str] = None,
+ *,
+ regex: List[str] = regex,
+ value_extract: str = value_extract,
score_normalization: Union[
str, Callable[[Union[str, None]], Any]
- ] = patterns.score_normalization_str,
+ ] = score_normalization_str,
attr: str = "NORM",
window: int = 7,
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
flags: Union[re.RegexFlag, int] = 0,
+ label: str = "emergency_priority",
+ span_setter: SpanSetterArg = {"ents": True, "emergency_priority": True},
):
"""
- Matcher for the Emergency Priority score.
+ Matcher for explicit mentions of the French emergency priority score.
+
+ Examples
+ --------
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.emergency_priority")
+ ```
Parameters
----------
- nlp: Language
- The spaCy Language object
- name: str
+ nlp : Language
+ The pipeline object
+ name : Optional[str]
The name of the component
- regex: List[str]
- The regex patterns to match
- value_extract: str
- The regex pattern to extract the value from the matched text
- score_normalization: Union[str, Callable[[Union[str, None]], Any]]
- The normalization function to apply to the extracted value
- attr: str
- The token attribute to match on (e.g. "TEXT" or "NORM")
- window: int
- The window size to search for the regex pattern
- ignore_excluded: bool
- Whether to ignore excluded tokens
- ignore_space_tokens: bool
- Whether to ignore space tokens
- flags: Union[re.RegexFlag, int]
- The regex flags to use
+ regex : List[str]
+ A list of regexes to identify the score
+ attr : str
+ Whether to match on the text ('TEXT') or on the normalized text ('NORM')
+ value_extract : str
+ Regex with capturing group to get the score value
+ score_normalization : Union[str, Callable[[Union[str,None]], Any]]
+ Function that takes the "raw" value extracted from the `value_extract`
+ regex and should return:
+
+ - None if no score could be extracted
+ - The desired score value otherwise
+ window : int
+ Number of tokens to include after the score's mention to find the
+ score's value
+ ignore_excluded : bool
+ Whether to ignore excluded spans when matching
+ ignore_space_tokens : bool
+ Whether to ignore space tokens when matching
+ flags : Union[re.RegexFlag, int]
+ Regex flags to use when matching
+ label : str
+ Label name to use for the `Span` object and the extension
+ span_setter: SpanSetterArg
+ How to set matches on the doc
Returns
-------
- Score
+ SimpleScoreMatcher
"""
- return Score(
+ return SimpleScoreMatcher(
nlp,
- score_name=name,
+ name=name,
regex=regex,
value_extract=value_extract,
score_normalization=score_normalization,
@@ -84,4 +114,6 @@ def create_component(
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
flags=flags,
+ label=label,
+ span_setter=span_setter,
)
diff --git a/edsnlp/pipelines/ner/scores/factory.py b/edsnlp/pipelines/ner/scores/factory.py
index 23b7fc484..c5d74a50b 100644
--- a/edsnlp/pipelines/ner/scores/factory.py
+++ b/edsnlp/pipelines/ner/scores/factory.py
@@ -1,84 +1,27 @@
-import re
-from typing import Any, Callable, List, Union
-
from spacy.language import Language
-from edsnlp.pipelines.ner.scores import Score
+from edsnlp.pipelines.ner.scores.base_score import SimpleScoreMatcher
from edsnlp.utils.deprecation import deprecated_factory
DEFAULT_CONFIG = dict(
+ regex=None,
attr="NORM",
+ value_extract=None,
+ score_normalization=None,
window=7,
ignore_excluded=False,
ignore_space_tokens=False,
flags=0,
+ span_setter={"ents": True},
)
-
-@deprecated_factory(
+create_component = SimpleScoreMatcher
+create_component = deprecated_factory(
"score",
"eds.score",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-@Language.factory(
+)(create_component)
+create_component = Language.factory(
"eds.score",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str = "eds.score",
- score_name: str = None,
- regex: List[str] = None,
- value_extract: str = None,
- score_normalization: Union[str, Callable[[Union[str, None]], Any]] = None,
- attr: str = "NORM",
- window: int = 7,
- flags: Union[re.RegexFlag, int] = 0,
- ignore_excluded: bool = False,
- ignore_space_tokens: bool = False,
-):
- """
- Parameters
- ----------
- nlp : Language
- The spaCy object.
- name : str
- The name of the component.
- score_name : str
- The name of the extracted score
- regex : List[str]
- A list of regexes to identify the score
- attr : str
- Whether to match on the text ('TEXT') or on the normalized text ('NORM')
- value_extract : str
- Regex with capturing group to get the score value
- score_normalization : Callable[[Union[str,None]], Any]
- Function that takes the "raw" value extracted from the `value_extract` regex,
- and should return:
-
- - None if no score could be extracted
- - The desired score value else
- window : int
- Number of token to include after the score's mention to find the
- score's value
- ignore_excluded : bool
- Whether to ignore excluded spans when matching
- ignore_space_tokens : bool
- Whether to ignore space tokens when matching
- flags : Union[re.RegexFlag, int]
- Regex flags to use when matching
- """
- return Score(
- nlp,
- score_name=score_name,
- regex=regex,
- value_extract=value_extract,
- score_normalization=score_normalization,
- attr=attr,
- flags=flags,
- window=window,
- ignore_excluded=ignore_excluded,
- ignore_space_tokens=ignore_space_tokens,
- )
+)(create_component)
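The registration above applies the factory decorators explicitly instead of defining a wrapper function: the component class itself is registered, first under the deprecated `score` name, then under `eds.score`. For readers more used to the decorator style, a sketch of the equivalent form (the class body is a stand-in, not the real implementation):

```python
from spacy.language import Language

from edsnlp.utils.deprecation import deprecated_factory


# Decorators apply bottom-up: the deprecated alias is registered first,
# then the canonical "eds.score" name, matching the assignments above.
@Language.factory("eds.score", assigns=["doc.ents", "doc.spans"])
@deprecated_factory("score", "eds.score", assigns=["doc.ents", "doc.spans"])
class MyScoreMatcher:  # stand-in for SimpleScoreMatcher
    def __init__(self, nlp: Language, name: str):
        ...
```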
diff --git a/edsnlp/pipelines/ner/scores/sofa/__init__.py b/edsnlp/pipelines/ner/scores/sofa/__init__.py
index 7ba4173c3..e69de29bb 100644
--- a/edsnlp/pipelines/ner/scores/sofa/__init__.py
+++ b/edsnlp/pipelines/ner/scores/sofa/__init__.py
@@ -1 +0,0 @@
-from .sofa import Sofa
diff --git a/edsnlp/pipelines/ner/scores/sofa/factory.py b/edsnlp/pipelines/ner/scores/sofa/factory.py
index 8c9b1fb97..59395d0dd 100644
--- a/edsnlp/pipelines/ner/scores/sofa/factory.py
+++ b/edsnlp/pipelines/ner/scores/sofa/factory.py
@@ -1,57 +1,112 @@
import re
-from typing import Any, Callable, Dict, List, Union
+from typing import Any, Callable, Dict, List, Optional, Union
from spacy.language import Language
-from edsnlp.pipelines.ner.scores.sofa import Sofa, patterns
+from edsnlp.pipelines.base import SpanSetterArg
from edsnlp.utils.deprecation import deprecated_factory
+from .patterns import regex, score_normalization_str, value_extract
+from .sofa import SofaMatcher
+
DEFAULT_CONFIG = dict(
- regex=patterns.regex,
- value_extract=patterns.value_extract,
- score_normalization=patterns.score_normalization_str,
+ regex=regex,
+ value_extract=value_extract,
+ score_normalization=score_normalization_str,
attr="NORM",
window=10,
ignore_excluded=False,
ignore_space_tokens=False,
flags=0,
+ label="sofa",
+ span_setter={"ents": True, "sofa": True},
)
@deprecated_factory(
"SOFA",
"eds.SOFA",
- default_config=DEFAULT_CONFIG,
assigns=["doc.ents", "doc.spans"],
)
-@Language.factory(
+@deprecated_factory(
"eds.SOFA",
- default_config=DEFAULT_CONFIG,
+ "eds.sofa",
+ assigns=["doc.ents", "doc.spans"],
+)
+@Language.factory(
+ "eds.sofa",
assigns=["doc.ents", "doc.spans"],
)
def create_component(
nlp: Language,
- name: str,
- regex: List[str] = patterns.regex,
- value_extract: List[Dict[str, str]] = patterns.value_extract,
+ name: Optional[str] = None,
+ *,
+ regex: List[str] = regex,
+ value_extract: List[Dict[str, str]] = value_extract,
score_normalization: Union[
str, Callable[[Union[str, None]], Any]
- ] = patterns.score_normalization_str,
+ ] = score_normalization_str,
attr: str = "NORM",
window: int = 10,
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
flags: Union[re.RegexFlag, int] = 0,
+ label: str = "sofa",
+ span_setter: SpanSetterArg = {"ents": True, "sofa": True},
):
+ '''
+ The `eds.sofa` component extracts
+ [Sequential Organ Failure Assessment (SOFA) scores](\
+ https://www.mdcalc.com/calc/691/sequential-organ-failure-assessment-sofa-score),
+ used to track a person's status during an intensive care unit stay and to
+ determine the extent of a person's organ function or rate of failure.
+
+ Examples
+ --------
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe("eds.normalizer")
+ nlp.add_pipe("eds.sofa")
+
+ text = """
+ SOFA (à 24H) : 12.
+ OMS:
"""
- Matcher component to extract the SOFA score
+
+ doc = nlp(text)
+ doc.ents
+ # Out: (SOFA (à 24H) : 12,)
+ ```
+
+ Extensions
+ ----------
+ Each extraction exposes 3 extensions:
+
+ ```python
+ ent = doc.ents[0]
+
+ ent._.score_name
+ # Out: 'sofa'
+
+ ent._.score_value
+ # Out: 12
+
+ ent._.score_method
+ # Out: '24H'
+ ```
+
+ The score method can be "24H", "Maximum", "A l'admission" or "Non précisée".
Parameters
----------
nlp : Language
- The spaCy object.
- name : str
- The name of the extracted score
+ The pipeline object
+ name : Optional[str]
+ The name of the component
regex : List[str]
A list of regexes to identify the SOFA score
attr : str
@@ -72,10 +127,14 @@ def create_component(
Whether to ignore space tokens
flags : Union[re.RegexFlag, int]
Flags to pass to the regex
- """
- return Sofa(
+ label: str
+ Label name to use for the `Span` object and the extension
+ span_setter: SpanSetterArg
+ How to set matches on the doc
+ '''
+ return SofaMatcher(
nlp,
- score_name=name,
+ name=name,
regex=regex,
value_extract=value_extract,
score_normalization=score_normalization,
@@ -84,4 +143,6 @@ def create_component(
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
flags=flags,
+ label=label,
+ span_setter=span_setter,
)
diff --git a/edsnlp/pipelines/ner/scores/sofa/sofa.py b/edsnlp/pipelines/ner/scores/sofa/sofa.py
index 0ea49962e..6f52e1f98 100644
--- a/edsnlp/pipelines/ner/scores/sofa/sofa.py
+++ b/edsnlp/pipelines/ner/scores/sofa/sofa.py
@@ -1,86 +1,25 @@
-import re
-from typing import Any, Callable, Dict, List, Union
+from typing import Iterable
-from spacy.language import Language
-from spacy.tokens import Span
+from spacy.tokens import Doc, Span
-from edsnlp.pipelines.ner.scores import Score
+from edsnlp.pipelines.ner.scores.base_score import SimpleScoreMatcher
-class Sofa(Score):
- """
- Matcher component to extract the SOFA score
-
- Parameters
- ----------
- nlp : Language
- The spaCy object.
- regex : List[str]
- A list of regexes to identify the SOFA score
- attr : str
- Whether to match on the text ('TEXT') or on the normalized text ('CUSTOM_NORM')
- value_extract : Dict[str, str]
- Regex to extract the score value
- score_normalization : Callable[[Union[str,None]], Any]
- Function that takes the "raw" value extracted from the `value_extract` regex,
- and should return
- - None if no score could be extracted
- - The desired score value else
- window : int
- Number of token to include after the score's mention to find the
- score's value
- ignore_excluded : bool
- Whether to ignore excluded spans
- ignore_space_tokens : bool
- Whether to ignore space tokens
- flags : Union[re.RegexFlag, int]
- Flags to pass to the regex
- """
-
- def __init__(
- self,
- nlp: Language,
- score_name: str,
- regex: List[str],
- attr: str,
- value_extract: List[Dict[str, str]],
- score_normalization: Union[str, Callable[[Union[str, None]], Any]],
- window: int,
- flags: Union[re.RegexFlag, int],
- ignore_excluded: bool,
- ignore_space_tokens: bool,
- ):
-
- super().__init__(
- nlp,
- score_name=score_name,
- regex=regex,
- value_extract=value_extract,
- score_normalization=score_normalization,
- attr=attr,
- window=window,
- flags=flags,
- ignore_excluded=ignore_excluded,
- ignore_space_tokens=ignore_space_tokens,
- )
-
- self.set_extensions()
-
- @classmethod
- def set_extensions(cls) -> None:
- super(Sofa, Sofa).set_extensions()
+class SofaMatcher(SimpleScoreMatcher):
+ def set_extensions(self):
+ super().set_extensions()
if not Span.has_extension("score_method"):
Span.set_extension("score_method", default=None)
- def score_filtering(self, ents: List[Span]) -> List[Span]:
+ def process(self, doc: Doc) -> Iterable[Span]:
"""
Extracts, if available, the value of the score.
Normalizes the score via the provided `self.score_normalization` method.
Parameters
----------
- ents: List[Span]
- List of spaCy's spans extracted by the score matcher
+ doc: Doc
+ Document to process
Returns
-------
@@ -88,7 +27,7 @@ def score_filtering(self, ents: List[Span]) -> List[Span]:
List of spaCy's spans, with, if found, an added `score_value` extension
"""
- for ent in ents:
+ for ent in super().process(doc):
assigned = ent._.assigned
if not assigned:
continue
@@ -104,8 +43,8 @@ def score_filtering(self, ents: List[Span]) -> List[Span]:
normalized_value = self.score_normalization(assigned["value"])
if normalized_value is not None:
- ent._.score_name = self.score_name
- ent._.score_value = int(normalized_value)
+ ent._.set(self.label, int(normalized_value))
+ ent._.score_name = self.label
ent._.score_method = method
yield ent
diff --git a/edsnlp/pipelines/ner/scores/tnm/__init__.py b/edsnlp/pipelines/ner/scores/tnm/__init__.py
deleted file mode 100644
index 02793314a..000000000
--- a/edsnlp/pipelines/ner/scores/tnm/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .tnm import TNM
diff --git a/edsnlp/pipelines/ner/scores/tnm/factory.py b/edsnlp/pipelines/ner/scores/tnm/factory.py
deleted file mode 100644
index 9617cfa23..000000000
--- a/edsnlp/pipelines/ner/scores/tnm/factory.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from typing import List, Optional, Union
-
-from spacy.language import Language
-
-from .tnm import TNM
-
-DEFAULT_CONFIG = dict(
- pattern=None,
- attr="TEXT",
-)
-
-
-@Language.factory(
- "eds.TNM",
- default_config=DEFAULT_CONFIG,
- assigns=["doc.ents", "doc.spans"],
-)
-def create_component(
- nlp: Language,
- name: str,
- pattern: Optional[Union[List[str], str]],
- attr: str,
-):
- return TNM(
- nlp,
- pattern=pattern,
- attr=attr,
- )
diff --git a/edsnlp/pipelines/ner/scores/tnm/tnm.py b/edsnlp/pipelines/ner/scores/tnm/tnm.py
deleted file mode 100644
index f5c6a94bd..000000000
--- a/edsnlp/pipelines/ner/scores/tnm/tnm.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""`eds.tnm` pipeline."""
-from typing import Dict, List, Optional, Tuple, Union
-
-from pydantic import ValidationError
-from spacy.language import Language
-from spacy.tokens import Doc, Span
-
-from edsnlp.matchers.regex import RegexMatcher
-from edsnlp.pipelines.base import BaseComponent
-from edsnlp.utils.filter import filter_spans
-
-from . import models, patterns
-
-PERIOD_PROXIMITY_THRESHOLD = 3
-
-
-class TNM(BaseComponent):
- """
- Tags and normalizes TNM mentions.
-
- Parameters
- ----------
- nlp : spacy.language.Language
- Language pipeline object
- pattern : Optional[Union[List[str], str]]
- List of regular expressions for TNM mentions.
- attr : str
- spaCy attribute to use
- """
-
- # noinspection PyProtectedMember
- def __init__(
- self,
- nlp: Language,
- pattern: Optional[Union[List[str], str]],
- attr: str,
- ):
-
- self.nlp = nlp
-
- if pattern is None:
- pattern = patterns.tnm_pattern
-
- if isinstance(pattern, str):
- pattern = [pattern]
-
- self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")
- self.regex_matcher.add("tnm", pattern)
-
- self.set_extensions()
-
- @classmethod
- def set_extensions(cls) -> None:
- """
- Set extensions for the dates pipeline.
- """
-
- if not Span.has_extension("value"):
- Span.set_extension("value", default=None)
-
- def process(self, doc: Doc) -> List[Span]:
- """
- Find TNM mentions in doc.
-
- Parameters
- ----------
- doc:
- spaCy Doc object
-
- Returns
- -------
- spans:
- list of tnm spans
- """
-
- spans = self.regex_matcher(
- doc,
- as_spans=True,
- return_groupdict=True,
- )
-
- spans = filter_spans(spans)
-
- return spans
-
- def parse(self, spans: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
- """
- Parse dates using the groupdict returned by the matcher.
-
- Parameters
- ----------
- spans : List[Tuple[Span, Dict[str, str]]]
- List of tuples containing the spans and groupdict
- returned by the matcher.
-
- Returns
- -------
- List[Span]
- List of processed spans, with the date parsed.
- """
-
- for span, groupdict in spans:
- try:
- span._.value = models.TNM.parse_obj(groupdict)
- except ValidationError:
- span._.value = models.TNM.parse_obj({})
-
- span.kb_id_ = span._.value.norm()
-
- return [span for span, _ in spans]
-
- def __call__(self, doc: Doc) -> Doc:
- """
- Tags TNM mentions.
-
- Parameters
- ----------
- doc : Doc
- spaCy Doc object
-
- Returns
- -------
- doc : Doc
- spaCy Doc object, annotated for TNM
- """
- spans = self.process(doc)
- spans = filter_spans(spans)
-
- spans = self.parse(spans)
-
- doc.spans["tnm"] = spans
-
- ents, discarded = filter_spans(list(doc.ents) + spans, return_discarded=True)
-
- doc.ents = ents
-
- if "discarded" not in doc.spans:
- doc.spans["discarded"] = []
- doc.spans["discarded"].extend(discarded)
-
- return doc
diff --git a/edsnlp/pipelines/ner/tnm/__init__.py b/edsnlp/pipelines/ner/tnm/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/edsnlp/pipelines/ner/tnm/factory.py b/edsnlp/pipelines/ner/tnm/factory.py
new file mode 100644
index 000000000..7832829c8
--- /dev/null
+++ b/edsnlp/pipelines/ner/tnm/factory.py
@@ -0,0 +1,24 @@
+from spacy.language import Language
+
+from edsnlp.utils.deprecation import deprecated_factory
+
+from .patterns import tnm_pattern
+from .tnm import TNMMatcher
+
+DEFAULT_CONFIG = dict(
+ pattern=tnm_pattern,
+ attr="TEXT",
+ label="tnm",
+ span_setter={"ents": True, "tnm": True},
+)
+
+create_component = TNMMatcher
+create_component = deprecated_factory(
+ "eds.TNM",
+ "eds.tnm",
+ assigns=["doc.ents", "doc.spans"],
+)(create_component)
+create_component = Language.factory(
+ "eds.tnm",
+ assigns=["doc.ents", "doc.spans"],
+)(create_component)
diff --git a/edsnlp/pipelines/ner/scores/tnm/models.py b/edsnlp/pipelines/ner/tnm/model.py
similarity index 98%
rename from edsnlp/pipelines/ner/scores/tnm/models.py
rename to edsnlp/pipelines/ner/tnm/model.py
index 683ed0743..b879b91b6 100644
--- a/edsnlp/pipelines/ner/scores/tnm/models.py
+++ b/edsnlp/pipelines/ner/tnm/model.py
@@ -137,6 +137,9 @@ def norm(self) -> str:
return "".join(norm)
+ def __str__(self):
+ return self.norm()
+
def dict(
self,
*,
diff --git a/edsnlp/pipelines/ner/scores/tnm/patterns.py b/edsnlp/pipelines/ner/tnm/patterns.py
similarity index 100%
rename from edsnlp/pipelines/ner/scores/tnm/patterns.py
rename to edsnlp/pipelines/ner/tnm/patterns.py
diff --git a/edsnlp/pipelines/ner/tnm/tnm.py b/edsnlp/pipelines/ner/tnm/tnm.py
new file mode 100644
index 000000000..543666ccc
--- /dev/null
+++ b/edsnlp/pipelines/ner/tnm/tnm.py
@@ -0,0 +1,169 @@
+"""`eds.tnm` pipeline."""
+from typing import Dict, List, Optional, Tuple, Union
+
+from pydantic import ValidationError
+from spacy.language import Language
+from spacy.tokens import Doc, Span
+
+from edsnlp.matchers.regex import RegexMatcher
+from edsnlp.pipelines.base import BaseNERComponent, SpanSetterArg
+from edsnlp.utils.filter import filter_spans
+
+from .model import TNM
+from .patterns import tnm_pattern
+
+
+class TNMMatcher(BaseNERComponent):
+ """
+ The `eds.tnm` component extracts [TNM](https://enwp.org/wiki/TNM_staging_system)
+ mentions from clinical documents.
+
+ Examples
+ --------
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe("eds.tnm")
+
+ text = "TNM: pTx N1 M1"
+
+ doc = nlp(text)
+ doc.ents
+ # Out: (pTx N1 M1,)
+
+ ent = doc.ents[0]
+ ent._.tnm.dict()
+ # {'modifier': 'p',
+ # 'tumour': None,
+ # 'tumour_specification': 'x',
+ # 'node': '1',
+ # 'node_specification': None,
+ # 'metastasis': '1',
+ # 'resection_completeness': None,
+ # 'version': None,
+ # 'version_year': None}
+ ```
+
+ Parameters
+ ----------
+ nlp : Optional[Language]
+ The pipeline object
+ name : str
+ The name of the pipe
+ pattern : Optional[Union[List[str], str]]
+ The regex pattern to use for matching TNM mentions
+ attr : str
+ Attribute to match on, eg `TEXT`, `NORM`, etc.
+ label : str
+ Label name to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+
+ Authors and citation
+ --------------------
+ The TNM matcher is based on the work of S. Priou, B. Rance and
+ E. Kempf ([@kempf:hal-03519085]).
+ """
+
+ # noinspection PyProtectedMember
+ def __init__(
+ self,
+ nlp: Optional[Language],
+ name: str = "eds.tnm",
+ *,
+ pattern: Optional[Union[List[str], str]] = tnm_pattern,
+ attr: str = "TEXT",
+ label: str = "tnm",
+ span_setter: SpanSetterArg = {"ents": True, "tnm": True},
+ ):
+ self.label = label
+
+ super().__init__(nlp=nlp, name=name, span_setter=span_setter)
+
+ if isinstance(pattern, str):
+ pattern = [pattern]
+
+ self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")
+ self.regex_matcher.add(self.label, pattern)
+
+ def set_extensions(self) -> None:
+ """
+ Set spaCy extensions
+ """
+ super().set_extensions()
+
+ if not Span.has_extension(self.label):
+ Span.set_extension(self.label, default=None)
+
+ def process(self, doc: Doc) -> List[Span]:
+ """
+ Find TNM mentions in doc.
+
+ Parameters
+ ----------
+ doc:
+ spaCy Doc object
+
+ Returns
+ -------
+ spans:
+ list of tnm spans
+ """
+
+ spans = self.regex_matcher(
+ doc,
+ as_spans=True,
+ return_groupdict=True,
+ )
+
+ spans = filter_spans(spans)
+
+ return spans
+
+ def parse(self, spans: List[Tuple[Span, Dict[str, str]]]) -> List[Span]:
+ """
+ Parse TNM mentions using the groupdict returned by the matcher.
+
+ Parameters
+ ----------
+ spans : List[Tuple[Span, Dict[str, str]]]
+ List of tuples containing the spans and groupdict
+ returned by the matcher.
+
+ Returns
+ -------
+ List[Span]
+ List of processed spans, with the TNM mention parsed.
+ """
+
+ for span, groupdict in spans:
+ try:
+ value = TNM.parse_obj(groupdict)
+ except ValidationError:
+ value = TNM.parse_obj({})
+
+ span._.set(self.label, value)
+ span.kb_id_ = value.norm()
+
+ return [span for span, _ in spans]
+
+ def __call__(self, doc: Doc) -> Doc:
+ """
+ Tags TNM mentions.
+
+ Parameters
+ ----------
+ doc : Doc
+ spaCy Doc object
+
+ Returns
+ -------
+ doc : Doc
+ spaCy Doc object, annotated for TNM
+ """
+ spans = self.process(doc)
+ spans = self.parse(spans)
+ self.set_spans(doc, spans)
+ return doc
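Together with the `__str__` method added to the TNM model earlier in this diff, the normalized TNM string is now reachable in three equivalent ways. A sketch, reusing the `ent` from the docstring example above:

```python
# All three should yield the same normalized string (sketch):
ent._.tnm.norm()  # explicit call on the parsed model
str(ent._.tnm)  # __str__ delegates to norm()
ent.kb_id_  # set from value.norm() in parse()
```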
diff --git a/edsnlp/pipelines/ner/umls/factory.py b/edsnlp/pipelines/ner/umls/factory.py
index 8e199d98a..4b860f3ca 100644
--- a/edsnlp/pipelines/ner/umls/factory.py
+++ b/edsnlp/pipelines/ner/umls/factory.py
@@ -1,73 +1,138 @@
from typing import Any, Dict, Union
from spacy.language import Language
+from typing_extensions import Literal
-from edsnlp.pipelines.core.terminology import TerminologyMatcher, TerminologyTermMatcher
+from edsnlp.pipelines.core.terminology.terminology import TerminologyMatcher
-from . import patterns
+from ...base import SpanSetterArg
+from .patterns import get_patterns
DEFAULT_CONFIG = dict(
attr="NORM",
ignore_excluded=False,
ignore_space_tokens=False,
- term_matcher=TerminologyTermMatcher.exact,
+ term_matcher="exact",
term_matcher_config={},
- pattern_config=dict(
- languages=["FRE"],
- sources=None,
- ),
+ pattern_config=dict(languages=["FRE"], sources=None),
+ label="umls",
+ span_setter={"ents": True, "umls": True},
)
@Language.factory(
- "eds.umls", default_config=DEFAULT_CONFIG, assigns=["doc.ents", "doc.spans"]
+ "eds.umls",
+ assigns=["doc.ents", "doc.spans"],
)
def create_component(
nlp: Language,
name: str = "eds.umls",
+ *,
attr: Union[str, Dict[str, str]] = "NORM",
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
- term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
+ term_matcher: Literal["exact", "simstring"] = "exact",
term_matcher_config: Dict[str, Any] = {},
- pattern_config: Dict[str, Any] = dict(
- languages=["FRE"],
- sources=None,
- ),
+ pattern_config: Dict[str, Any] = dict(languages=["FRE"], sources=None),
+ label: str = "umls",
+ span_setter: SpanSetterArg = {"ents": True, "umls": True},
):
"""
- Create a component to recognize and normalize terms in document that
- normalize to UMLS concepts.
+ The `eds.umls` pipeline component matches the UMLS (Unified Medical Language
+ System, from the NIH) terminology.
+
+ !!! warning "Very low recall"
+
+ When using the `exact` matching mode, this component has a very poor recall
+ performance. We can use the `simstring` mode to retrieve approximate matches,
+ albeit at the cost of a significantly higher computation time.
+
+ Examples
+ --------
+ `eds.umls` is an additional module that needs to be set up by:
+
+ 1. `pip install -U umls_downloader`
+ 2. [Signing up](https://uts.nlm.nih.gov/uts/signup-login) for a UMLS Terminology
+ Services Account. After filling in a short form, you will receive your API token
+ within a few days.
+ 3. Set `UMLS_API_KEY` locally: `export UMLS_API_KEY=your_api_key`
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.umls")
+
+ text = "Grosse toux: le malade a été mordu par des Amphibiens " "sous le genou"
+
+ doc = nlp(text)
+
+ doc.ents
+ # Out: (toux, a, par, Amphibiens, genou)
+
+ ent = doc.ents[0]
+
+ ent.label_
+ # Out: umls
+
+ ent._.umls
+ # Out: C0010200
+ ```
+
+ You can easily change the default languages and sources with the `pattern_config`
+ argument:
+
+ ```python
+ import spacy
+
+ # Enable the French and English languages, through the French MeSH and LOINC
+ pattern_config = dict(languages=["FRE", "ENG"], sources=["MSHFRE", "LNC"])
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.umls", config=dict(pattern_config=pattern_config))
+ ```
+
+ See more options of languages and sources
+ [here](https://www.nlm.nih.gov/research/umls/sourcereleasedocs/index.html).
Parameters
----------
- nlp: Language
+ nlp : Language
spaCy `Language` object.
- name: str
+ name : str
The name of the pipe
- attr: Union[str, Dict[str, str]]
+ attr : Union[str, Dict[str, str]]
Attribute to match on, eg `TEXT`, `NORM`, etc.
- ignore_excluded: bool
+ ignore_excluded : bool
Whether to skip excluded tokens during matching.
- ignore_space_tokens: bool
+ ignore_space_tokens : bool
Whether to skip space tokens during matching.
- term_matcher: TerminologyTermMatcher
- The term matcher to use, either `TerminologyTermMatcher.exact` or
- `TerminologyTermMatcher.simstring`
- term_matcher_config: Dict[str, Any]
+ term_matcher : Literal["exact", "simstring"]
+ The term matcher to use, either "exact" or "simstring"
+ term_matcher_config : Dict[str, Any]
The configuration for the term matcher
- pattern_config: Dict[str, Any]
+ pattern_config : Dict[str, Any]
The pattern retriever configuration
- """
+ label : str
+ Label name to use for the `Span` object and the extension
+ span_setter : SpanSetterArg
+ How to set matches on the doc
+ Authors and citation
+ --------------------
+ The `eds.umls` pipeline was developed by AP-HP's Data Science team and INRIA
+ SODA's team.
+ """
return TerminologyMatcher(
- nlp,
- label="umls",
- regex=None,
- terms=patterns.get_patterns(pattern_config),
+ nlp=nlp,
+ name=name,
+ regex=dict(),
+ terms=get_patterns(pattern_config),
attr=attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
term_matcher=term_matcher,
term_matcher_config=term_matcher_config,
+ label=label,
+ span_setter=span_setter,
)
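Given the recall warning above, switching to approximate matching only requires the `term_matcher` argument. A sketch; the contents of `term_matcher_config` are backend-specific and not covered by this diff:

```python
import spacy

nlp = spacy.blank("eds")
nlp.add_pipe(
    "eds.umls",
    config=dict(
        term_matcher="simstring",
        term_matcher_config={},  # backend options would go here (assumption)
    ),
)
```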
diff --git a/edsnlp/pipelines/qualifiers/base.py b/edsnlp/pipelines/qualifiers/base.py
index a2ec1afee..5eb982d3d 100644
--- a/edsnlp/pipelines/qualifiers/base.py
+++ b/edsnlp/pipelines/qualifiers/base.py
@@ -6,7 +6,7 @@
from spacy.tokens import Doc, Span
from edsnlp.matchers.phrase import EDSPhraseMatcher
-from edsnlp.pipelines.base import BaseComponent
+from edsnlp.pipelines.base import BaseComponent, SpanGetterArg, validate_span_getter
def check_normalizer(nlp: Language) -> None:
@@ -29,22 +29,25 @@ def get_qualifier_extensions(nlp: Language):
return {
name: nlp.get_pipe_meta(name).assigns[0].split("span.")[-1]
for name, pipe in nlp.pipeline
- if isinstance(pipe, Qualifier)
+ if isinstance(pipe, RuleBasedQualifier)
}
-class Qualifier(BaseComponent):
+class RuleBasedQualifier(BaseComponent):
"""
Implements the NegEx algorithm.
Parameters
----------
nlp : Language
- spaCy nlp pipeline to use for matching.
+ The pipeline object.
attr : str
spaCy's attribute to use:
a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
we can also add a key for each regex.
+ span_getter : SpanGetterArg
+ Where to look for matches in the doc. By default, only the entities in
+ `doc.ents` are considered.
on_ents_only : Union[bool, str, List[str], Set[str]]
Whether to look for matches around detected entities only.
Useful for faster inference in downstream tasks.
@@ -63,11 +66,15 @@ class Qualifier(BaseComponent):
def __init__(
self,
nlp: Language,
+ name: Optional[str] = None,
+ *,
attr: str,
+ span_getter: SpanGetterArg,
on_ents_only: Union[bool, str, List[str], Set[str]],
explain: bool,
- **terms: Dict[str, Optional[List[str]]],
+ terms: Dict[str, Optional[List[str]]],
):
+ super().__init__(nlp=nlp, name=name)
if attr.upper() == "NORM":
check_normalizer(nlp)
@@ -77,36 +84,21 @@ def __init__(
self.on_ents_only = on_ents_only
- assert isinstance(on_ents_only, (list, str, set, bool)), (
- "The `on_ents_only` argument should be a "
- "string, a bool, a list or a set of string"
- )
+ if on_ents_only:
+ assert isinstance(on_ents_only, (list, str, set, bool)), (
+ "The `on_ents_only` argument should be a "
+ "string, a bool, a list or a set of string"
+ )
- if isinstance(on_ents_only, list):
- on_ents_only = set(on_ents_only)
- elif isinstance(on_ents_only, str):
- on_ents_only = set([on_ents_only])
- self.on_ents_only = on_ents_only
+ assert span_getter is None, (
+ "Cannot use both `on_ents_only` and " "`span_getter`"
+ )
+ span_getter = "ents" if on_ents_only is True else on_ents_only
+ else:
+ span_getter = "ents"
+ self.span_getter = validate_span_getter(span_getter)
self.explain = explain
- def get_defaults(self, **kwargs: Optional[List[str]]) -> Dict[str, List[str]]:
- """
- Merge terms with their defaults. Null keys are replaced with defaults.
-
- Returns
- -------
- Dict[str, List[str]]
- Merged dictionary
- """
- # Filter out empty keys
- kwargs = {k: v for k, v in kwargs.items() if v is not None}
-
- # Update defaults
- terms = self.defaults.copy()
- terms.update(kwargs)
-
- return terms
-
def get_matches(self, doc: Doc) -> List[Span]:
"""
Extract matches.
@@ -124,14 +116,11 @@ def get_matches(self, doc: Doc) -> List[Span]:
if self.on_ents_only:
sents = set([ent.sent for ent in self.get_spans(doc)])
- match_iterator = map(
- lambda sent: self.phrase_matcher(sent, as_spans=True), sents
- )
+ match_iterator = (self.phrase_matcher(s, as_spans=True) for s in sents)
matches = chain.from_iterable(match_iterator)
else:
-
matches = self.phrase_matcher(doc, as_spans=True)
return list(matches)
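The constructor above funnels the legacy `on_ents_only` argument into the new `span_getter` mechanism. A condensed sketch of that resolution logic, mirroring the code rather than any public API:

```python
def resolve_span_getter(on_ents_only, span_getter):
    # `on_ents_only` and `span_getter` are mutually exclusive; True means
    # "look at doc.ents", any other truthy value is passed through as-is.
    if on_ents_only:
        assert span_getter is None, "Cannot use both `on_ents_only` and `span_getter`"
        return "ents" if on_ents_only is True else on_ents_only
    return "ents"
```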
diff --git a/edsnlp/pipelines/qualifiers/family/__init__.py b/edsnlp/pipelines/qualifiers/family/__init__.py
index 5a3dc3693..3bd3ef9a0 100644
--- a/edsnlp/pipelines/qualifiers/family/__init__.py
+++ b/edsnlp/pipelines/qualifiers/family/__init__.py
@@ -1 +1,3 @@
-from .family import FamilyContext
+from .family import FamilyContextQualifier
+
+FamilyContext = FamilyContextQualifier
diff --git a/edsnlp/pipelines/qualifiers/family/factory.py b/edsnlp/pipelines/qualifiers/family/factory.py
index 703ec89f3..7576b6bd6 100644
--- a/edsnlp/pipelines/qualifiers/family/factory.py
+++ b/edsnlp/pipelines/qualifiers/family/factory.py
@@ -1,47 +1,25 @@
-from typing import List, Optional, Set, Union
-
from spacy.language import Language
-from edsnlp.pipelines.qualifiers.family import FamilyContext
from edsnlp.utils.deprecation import deprecated_factory
+from .family import FamilyContextQualifier
+
DEFAULT_CONFIG = dict(
+ attr="NORM",
family=None,
termination=None,
- attr="NORM",
- use_sections=False,
- explain=False,
+ use_sections=True,
+ span_getter=None,
on_ents_only=True,
+ explain=False,
)
-
-@deprecated_factory(
+create_component = deprecated_factory(
"family",
"eds.family",
- default_config=DEFAULT_CONFIG,
assigns=["span._.family"],
-)
-@Language.factory(
+)(FamilyContextQualifier)
+create_component = Language.factory(
"eds.family",
- default_config=DEFAULT_CONFIG,
assigns=["span._.family"],
-)
-def create_component(
- nlp: Language,
- name: str,
- family: Optional[List[str]],
- termination: Optional[List[str]],
- attr: str,
- explain: bool,
- on_ents_only: Union[bool, str, List[str], Set[str]],
- use_sections: bool,
-):
- return FamilyContext(
- nlp,
- family=family,
- termination=termination,
- attr=attr,
- explain=explain,
- on_ents_only=on_ents_only,
- use_sections=use_sections,
- )
+)(create_component)
diff --git a/edsnlp/pipelines/qualifiers/family/family.py b/edsnlp/pipelines/qualifiers/family/family.py
index 0b4ce0462..47739e5a5 100644
--- a/edsnlp/pipelines/qualifiers/family/family.py
+++ b/edsnlp/pipelines/qualifiers/family/family.py
@@ -4,33 +4,89 @@
from spacy.language import Language
from spacy.tokens import Doc, Span, Token
-from edsnlp.pipelines.qualifiers.base import Qualifier
-from edsnlp.pipelines.terminations import termination
-from edsnlp.utils.filter import consume_spans, filter_spans, get_spans
+from edsnlp.pipelines.base import SpanGetterArg, get_spans
+from edsnlp.pipelines.qualifiers.base import RuleBasedQualifier
+from edsnlp.pipelines.terminations import termination as default_termination
+from edsnlp.utils.filter import consume_spans, filter_spans
from edsnlp.utils.inclusion import check_inclusion
-from .patterns import family
+from . import patterns
-class FamilyContext(Qualifier):
+class FamilyContextQualifier(RuleBasedQualifier):
"""
- Implements a family context detection algorithm.
+ The `eds.family` component uses a simple rule-based algorithm to detect spans that
+ describe a family member (or family history) of the patient rather than the
+ patient themself.
+
+ Examples
+ --------
+ The following snippet matches a simple terminology, and checks the family context
+    of the extracted entities. It is complete and can be run _as is_.
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ # Dummy matcher
+ nlp.add_pipe(
+ "eds.matcher",
+ config=dict(terms=dict(douleur="douleur", osteoporose="ostéoporose")),
+ )
+ nlp.add_pipe("eds.family")
+
+ text = (
+ "Le patient est admis le 23 août 2021 pour une douleur au bras. "
+ "Il a des antécédents familiaux d'ostéoporose"
+ )
+
+ doc = nlp(text)
+
+ doc.ents
+ # Out: (douleur, ostéoporose)
+
+ doc.ents[0]._.family
+ # Out: False
+
+ doc.ents[1]._.family
+ # Out: True
+ ```
+
+ Extensions
+ ----------
+ The `eds.family` component declares two extensions, on both `Span` and `Token`
+    objects:
- The component looks for terms indicating family references in the text.
+ 1. The `family` attribute is a boolean, set to `True` if the component predicts
+ that the span/token relates to a family member.
+ 2. The `family_` property is a human-readable string, computed from the `family`
+ attribute. It implements a simple getter function that outputs `PATIENT` or
+ `FAMILY`, depending on the value of `family`.
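+
+    Continuing the example above (a minimal illustration):
+
+    ```python
+    doc.ents[1]._.family_
+    # Out: 'FAMILY'
+    ```
+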
Parameters
----------
nlp : Language
- spaCy nlp pipeline to use for matching.
- family : Optional[List[str]]
- List of terms indicating family reference.
+ The pipeline object.
+ name : Optional[str]
+ The component name.
attr : str
spaCy's attribute to use:
a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
we can also add a key for each regex.
+ family : Optional[List[str]]
+ List of terms indicating family reference.
+    termination : Optional[List[str]]
+        List of termination terms that delimit syntagms.
+    span_getter : SpanGetterArg
+        Where to look for entities in the doc. By default, look in `doc.ents`.
on_ents_only : Union[bool, str, List[str], Set[str]]
+ Deprecated, use `span_getter` instead.
+
Whether to look for matches around detected entities only.
Useful for faster inference in downstream tasks.
+
- If True, will look in all ents located in `doc.ents` only
- If an iterable of string is passed, will additionally look in `doc.spans[key]`
for each key in the iterable
@@ -38,39 +94,40 @@ class FamilyContext(Qualifier):
Whether to keep track of cues for each entity.
-    use_sections : bool, by default `False`
+    use_sections : bool, by default `True`
Whether to use annotated sections (namely `antécédents familiaux`).
- """
- defaults = dict(
- family=family,
- termination=termination,
- )
+ Authors and citation
+ --------------------
+ The `eds.family` component was developed by AP-HP's Data Science team.
+ """
def __init__(
self,
nlp: Language,
- attr: str,
- family: Optional[List[str]],
- termination: Optional[List[str]],
- use_sections: bool,
- explain: bool,
- on_ents_only: Union[bool, str, List[str], Set[str]],
+ name: Optional[str] = "eds.family",
+ *,
+ attr: str = "NORM",
+ family: Optional[List[str]] = None,
+ termination: Optional[List[str]] = None,
+ use_sections: bool = True,
+ span_getter: SpanGetterArg = None,
+ on_ents_only: Union[bool, str, List[str], Set[str]] = True,
+ explain: bool = False,
):
-
- terms = self.get_defaults(
- family=family,
- termination=termination,
+ terms = dict(
+ family=patterns.family if family is None else family,
+ termination=default_termination if termination is None else termination,
)
super().__init__(
nlp=nlp,
+ name=name,
attr=attr,
- on_ents_only=on_ents_only,
explain=explain,
- **terms,
+ terms=terms,
+ on_ents_only=on_ents_only,
+ span_getter=span_getter,
)
- self.set_extensions()
-
self.sections = use_sections and (
"eds.sections" in nlp.pipe_names or "sections" in nlp.pipe_names
)
@@ -81,25 +138,17 @@ def __init__(
"Skipping that step."
)
- @classmethod
- def set_extensions(cls) -> None:
- if not Token.has_extension("family"):
- Token.set_extension("family", default=False)
-
- if not Token.has_extension("family_"):
- Token.set_extension(
- "family_",
- getter=lambda token: "FAMILY" if token._.family else "PATIENT",
- )
-
- if not Span.has_extension("family"):
- Span.set_extension("family", default=False)
+ def set_extensions(self) -> None:
+ super().set_extensions()
+ for cls in (Token, Span):
+ if not cls.has_extension("family"):
+ cls.set_extension("family", default=False)
- if not Span.has_extension("family_"):
- Span.set_extension(
- "family_",
- getter=lambda span: "FAMILY" if span._.family else "PATIENT",
- )
+ if not cls.has_extension("family_"):
+ cls.set_extension(
+ "family_",
+ getter=lambda token: "FAMILY" if token._.family else "PATIENT",
+ )
if not Span.has_extension("family_cues"):
Span.set_extension("family_cues", default=[])
@@ -108,26 +157,15 @@ def set_extensions(cls) -> None:
Doc.set_extension("family", default=[])
def process(self, doc: Doc) -> Doc:
- """
- Finds entities related to family context.
-
- Parameters
- ----------
- doc: spaCy Doc object
-
- Returns
- -------
- doc: spaCy Doc object, annotated for context
- """
matches = self.get_matches(doc)
- terminations = get_spans(matches, "termination")
+ terminations = [m for m in matches if m.label_ == "termination"]
boundaries = self._boundaries(doc, terminations)
# Removes duplicate matches and pseudo-expressions in one statement
matches = filter_spans(matches, label_to_remove="pseudo")
- entities = list(self.get_spans(doc))
+ entities = list(get_spans(doc, self.span_getter))
ents = None
sections = []
@@ -140,7 +178,6 @@ def process(self, doc: Doc) -> Doc:
]
for start, end in boundaries:
-
ents, entities = consume_spans(
entities,
filter=lambda s: check_inclusion(s, start, end),
@@ -156,17 +193,12 @@ def process(self, doc: Doc) -> Doc:
if self.on_ents_only and not ents:
continue
- cues = get_spans(sub_matches, "family")
- cues += sub_sections
+ cues = [m for m in sub_matches if m.label_ == "family"]
+ cues.extend(sub_sections)
if not cues:
continue
- family = bool(cues)
-
- if not family:
- continue
-
if not self.on_ents_only:
for token in doc[start:end]:
token._.family = True
@@ -175,7 +207,7 @@ def process(self, doc: Doc) -> Doc:
ent._.family = True
if self.explain:
ent._.family_cues += cues
- if not self.on_ents_only:
+ if not self.on_ents_only and ent._.family:
for token in ent:
token._.family = True
diff --git a/edsnlp/pipelines/qualifiers/history/__init__.py b/edsnlp/pipelines/qualifiers/history/__init__.py
index f3fb5e8e1..662e37897 100644
--- a/edsnlp/pipelines/qualifiers/history/__init__.py
+++ b/edsnlp/pipelines/qualifiers/history/__init__.py
@@ -1 +1,3 @@
-from .history import History
+from .history import HistoryQualifier
+
+History = HistoryQualifier
diff --git a/edsnlp/pipelines/qualifiers/history/factory.py b/edsnlp/pipelines/qualifiers/history/factory.py
index 28b2641dd..b02ba19de 100644
--- a/edsnlp/pipelines/qualifiers/history/factory.py
+++ b/edsnlp/pipelines/qualifiers/history/factory.py
@@ -1,72 +1,31 @@
-from typing import List, Optional, Set, Union
-
from spacy.language import Language
-from edsnlp.pipelines.qualifiers.history import History, patterns
-from edsnlp.pipelines.terminations import termination
from edsnlp.utils.deprecation import deprecated_factory
+from .history import HistoryQualifier
+
DEFAULT_CONFIG = dict(
- attr="NORM",
- history=patterns.history,
- termination=termination,
+ history=None,
+ termination=None,
use_sections=False,
use_dates=False,
+ attr="NORM",
history_limit=14,
- exclude_birthdate=True,
closest_dates_only=True,
- explain=False,
+ exclude_birthdate=True,
+ span_getter=None,
on_ents_only=True,
+ explain=False,
)
-
-@deprecated_factory(
- "antecedents",
+create_component = HistoryQualifier
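+# Each pass through the loop re-wraps the component so that the legacy names
+# "history", "antecedents" and "eds.antecedents" all redirect to "eds.history"
+# with a deprecation warning.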
+for name in ["history", "antecedents", "eds.antecedents"]:
+ create_component = deprecated_factory(
+ name,
+ "eds.history",
+ assigns=["span._.history"],
+ )(create_component)
+create_component = Language.factory(
"eds.history",
- default_config=DEFAULT_CONFIG,
assigns=["span._.history"],
-)
-@deprecated_factory(
- "eds.antecedents",
- "eds.history",
- default_config=DEFAULT_CONFIG,
- assigns=["span._.history"],
-)
-@deprecated_factory(
- "history",
- "eds.history",
- default_config=DEFAULT_CONFIG,
- assigns=["span._.history"],
-)
-@Language.factory(
- "eds.history",
- default_config=DEFAULT_CONFIG,
- assigns=["span._.history"],
-)
-def create_component(
- nlp: Language,
- name: str,
- history: Optional[List[str]],
- termination: Optional[List[str]],
- use_sections: bool,
- use_dates: bool,
- history_limit: int,
- exclude_birthdate: bool,
- closest_dates_only: bool,
- attr: str,
- explain: bool,
- on_ents_only: Union[bool, str, List[str], Set[str]],
-):
- return History(
- nlp,
- attr=attr,
- history=history,
- termination=termination,
- use_sections=use_sections,
- use_dates=use_dates,
- history_limit=history_limit,
- exclude_birthdate=exclude_birthdate,
- closest_dates_only=closest_dates_only,
- explain=explain,
- on_ents_only=on_ents_only,
- )
+)(create_component)
diff --git a/edsnlp/pipelines/qualifiers/history/history.py b/edsnlp/pipelines/qualifiers/history/history.py
index 689b77beb..2d6833c4a 100644
--- a/edsnlp/pipelines/qualifiers/history/history.py
+++ b/edsnlp/pipelines/qualifiers/history/history.py
@@ -6,87 +6,199 @@
from spacy.language import Language
from spacy.tokens import Doc, Span, Token
-from edsnlp.pipelines.qualifiers.base import Qualifier
-from edsnlp.pipelines.terminations import termination
+from edsnlp.pipelines.base import SpanGetterArg, get_spans
+from edsnlp.pipelines.qualifiers.base import RuleBasedQualifier
+from edsnlp.pipelines.terminations import termination as default_termination
from edsnlp.utils.deprecation import deprecated_getter_factory
-from edsnlp.utils.filter import consume_spans, filter_spans, get_spans
+from edsnlp.utils.filter import consume_spans, filter_spans
from edsnlp.utils.inclusion import check_inclusion, check_sent_inclusion
-from .patterns import history, sections_history
+from . import patterns
+from .patterns import sections_history
-class History(Qualifier):
+class HistoryQualifier(RuleBasedQualifier):
"""
- Implements a history detection algorithm.
+    The `eds.history` component uses a simple rule-based algorithm to detect spans
+    that describe medical history rather than the diagnosis of a given visit.
- The component looks for terms indicating history in the text.
+    The very definition of a medical history is not straightforward.
+ Hence, this component only tags entities that are _explicitly described as part of
+ the medical history_, e.g., preceded by a synonym of "medical history".
+
+ This component may also use the output of:
+
+    - the [`eds.sections` component](/pipelines/misc/sections/). In that case, the
+      entire `antécédents` section is tagged as a medical history.
+
+ !!! warning "Sections"
+
+        Be careful: the `eds.sections` component may overextend the `antécédents`
+        section. Indeed, it detects *section titles* and tags the entire text between a
+        title and the next one as a section. Hence, should a section title go undetected
+        after the `antécédents` title, some parts of the document will erroneously be
+        tagged as a medical history.
+
+ To curb that possibility, using the output of the `eds.sections` component is
+ deactivated by default.
+
+    - the [`eds.dates` component](/pipelines/misc/dates). In that case, the extracted
+      dates are taken into account to decide whether an entity is a medical history.
+
+ !!! info "Dates"
+
+        To make the most of the `eds.dates` component, you may add the `note_datetime`
+        context (cf. [Adding context][using-eds-nlps-helper-functions]). It allows the
+        component to compute how long before the note an absolute date
+        (e.g., "le 28 août 2022"/August 28, 2022) occurred. The `birth_datetime`
+        context allows the component to exclude the birthdate from the extracted dates.
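+
+        As a minimal sketch (the dates below are placeholders; spaCy pipelines
+        accept a pre-built `Doc`):
+
+        ```python
+        from datetime import datetime
+
+        doc = nlp.make_doc(text)
+        doc._.note_datetime = datetime(2022, 8, 28)  # placeholder document date
+        doc._.birth_datetime = datetime(1950, 7, 14)  # placeholder birthdate
+        doc = nlp(doc)
+        ```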
+
+ Examples
+ --------
+ The following snippet matches a simple terminology, and checks whether the extracted
+ entities are history or not. It is complete and can be run _as is_.
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ nlp.add_pipe("eds.normalizer")
+ nlp.add_pipe("eds.sections")
+ nlp.add_pipe("eds.dates")
+ nlp.add_pipe(
+ "eds.matcher",
+ config=dict(terms=dict(douleur="douleur", malaise="malaises")),
+ )
+ nlp.add_pipe(
+ "eds.history",
+ config=dict(
+ use_sections=True,
+ use_dates=True,
+ ),
+ )
+
+    text = (
+        "Le patient est admis le 23 août 2021 pour une douleur au bras. "
+        "Il a des antécédents de malaises. "
+ "ANTÉCÉDENTS : "
+ "- le patient a déjà eu des malaises. "
+ "- le patient a eu une douleur à la jambe il y a 10 jours"
+ )
+
+ doc = nlp(text)
+
+ doc.ents
+ # Out: (douleur, malaises, malaises, douleur)
+
+ doc.ents[0]._.history
+ # Out: False
+
+ doc.ents[1]._.history
+ # Out: True
+
+ doc.ents[2]._.history # (1)
+ # Out: True
+
+ doc.ents[3]._.history # (2)
+ # Out: False
+ ```
+
+    1. The entity is in the `antécédents` section.
+    2. The entity is also in the `antécédents` section; however, the extracted
+       `relative_date` refers to an event that took place less than 14 days ago.
+
+ Extensions
+ ----------
+ The `eds.history` component declares two extensions, on both `Span` and `Token`
+    objects:
+
+ 1. The `history` attribute is a boolean, set to `True` if the component predicts
+ that the span/token is a medical history.
+ 2. The `history_` property is a human-readable string, computed from the `history`
+ attribute. It implements a simple getter function that outputs `CURRENT` or
+ `ATCD`, depending on the value of `history`.
Parameters
----------
nlp : Language
- spaCy nlp pipeline to use for matching.
+ The pipeline object.
+ name : Optional[str]
+ The component name.
history : Optional[List[str]]
List of terms indicating medical history reference.
+    termination : Optional[List[str]]
+        List of termination terms that delimit syntagms.
use_sections : bool
Whether to use section pipeline to detect medical history section.
use_dates : bool
Whether to use dates pipeline to detect if the event occurs
a long time before the document date.
- history_limit : int
- The number of days after which the event is considered as history.
- exclude_birthdate : bool
- Whether to exclude the birth date from history dates.
- closest_dates_only : bool
- Whether to include the closest dates only.
attr : str
spaCy's attribute to use:
a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
we can also add a key for each regex.
+    history_limit : Union[int, timedelta]
+        The number of days after which an event is considered history.
+ exclude_birthdate : bool
+ Whether to exclude the birthdate from history dates.
+ closest_dates_only : bool
+ Whether to include the closest dates only.
+    span_getter : SpanGetterArg
+        Where to look for entities in the doc. By default, look in `doc.ents`.
on_ents_only : Union[bool, str, List[str], Set[str]]
+ Deprecated, use `span_getter` instead.
+
Whether to look for matches around detected entities only.
Useful for faster inference in downstream tasks.
+
- If True, will look in all ents located in `doc.ents` only
- If an iterable of string is passed, will additionally look in `doc.spans[key]`
for each key in the iterable
explain : bool
Whether to keep track of cues for each entity.
+
+ Authors and citation
+ --------------------
+ The `eds.history` component was developed by AP-HP's Data Science team.
"""
- defaults = dict(
- history=history,
- termination=termination,
- )
+ history_limit: timedelta
def __init__(
self,
nlp: Language,
- attr: str,
- history: Optional[List[str]],
- termination: Optional[List[str]],
- use_sections: bool,
- use_dates: bool,
- history_limit: int,
- closest_dates_only: bool,
- exclude_birthdate: bool,
- explain: bool,
- on_ents_only: Union[bool, str, List[str], Set[str]],
+ name: Optional[str] = "eds.history",
+ *,
+ history: Optional[List[str]] = None,
+ termination: Optional[List[str]] = None,
+ use_sections: bool = False,
+ use_dates: bool = False,
+ attr: str = "NORM",
+ history_limit: int = 14,
+ closest_dates_only: bool = True,
+ exclude_birthdate: bool = True,
+ span_getter: SpanGetterArg = None,
+ on_ents_only: Union[bool, str, List[str], Set[str]] = True,
+ explain: bool = False,
):
- terms = self.get_defaults(
- history=history,
- termination=termination,
+ terms = dict(
+ history=patterns.history if history is None else history,
+ termination=default_termination if termination is None else termination,
)
super().__init__(
nlp=nlp,
+ name=name,
attr=attr,
- on_ents_only=on_ents_only,
explain=explain,
- **terms,
+ terms=terms,
+ on_ents_only=on_ents_only,
+ span_getter=span_getter,
)
- self.set_extensions()
-
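+        # A bare integer is interpreted by `timedelta` as a number of days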
self.history_limit = timedelta(history_limit)
self.exclude_birthdate = exclude_birthdate
self.closest_dates_only = closest_dates_only
@@ -127,74 +239,41 @@ def __init__(
"context. "
)
- @classmethod
- def set_extensions(cls) -> None:
-
- if not Token.has_extension("history"):
- Token.set_extension("history", default=False)
-
- if not Token.has_extension("antecedents"):
- Token.set_extension(
- "antecedents",
- getter=deprecated_getter_factory("antecedents", "history"),
- )
-
- if not Token.has_extension("antecedent"):
- Token.set_extension(
- "antecedent",
- getter=deprecated_getter_factory("antecedent", "history"),
- )
-
- if not Token.has_extension("history_"):
- Token.set_extension(
- "history_",
- getter=lambda token: "ATCD" if token._.history else "CURRENT",
- )
-
- if not Token.has_extension("antecedents_"):
- Token.set_extension(
- "antecedents_",
- getter=deprecated_getter_factory("antecedents_", "history_"),
- )
+ def set_extensions(self) -> None:
+ for cls in (Token, Span):
+ if not cls.has_extension("history"):
+ cls.set_extension("history", default=False)
- if not Token.has_extension("antecedent_"):
- Token.set_extension(
- "antecedent_",
- getter=deprecated_getter_factory("antecedent_", "history_"),
- )
-
- if not Span.has_extension("history"):
- Span.set_extension("history", default=False)
+ if not cls.has_extension("antecedents"):
+ cls.set_extension(
+ "antecedents",
+ getter=deprecated_getter_factory("antecedents", "history"),
+ )
- if not Span.has_extension("antecedents"):
- Span.set_extension(
- "antecedents",
- getter=deprecated_getter_factory("antecedents", "history"),
- )
+ if not cls.has_extension("antecedent"):
+ cls.set_extension(
+ "antecedent",
+ getter=deprecated_getter_factory("antecedent", "history"),
+ )
- if not Span.has_extension("antecedent"):
- Span.set_extension(
- "antecedent",
- getter=deprecated_getter_factory("antecedent", "history"),
- )
+ if not cls.has_extension("history_"):
+ cls.set_extension(
+ "history_",
+ getter=lambda token: "ATCD" if token._.history else "CURRENT",
+ )
- if not Span.has_extension("history_"):
- Span.set_extension(
- "history_",
- getter=lambda span: "ATCD" if span._.history else "CURRENT",
- )
+ if not cls.has_extension("antecedents_"):
+ cls.set_extension(
+ "antecedents_",
+ getter=deprecated_getter_factory("antecedents_", "history_"),
+ )
- if not Span.has_extension("antecedents_"):
- Span.set_extension(
- "antecedents_",
- getter=deprecated_getter_factory("antecedents_", "history_"),
- )
+ if not cls.has_extension("antecedent_"):
+ cls.set_extension(
+ "antecedent_",
+ getter=deprecated_getter_factory("antecedent_", "history_"),
+ )
- if not Span.has_extension("antecedent_"):
- Span.set_extension(
- "antecedent_",
- getter=deprecated_getter_factory("antecedent_", "history_"),
- )
# Store history mentions responsible for the history entity's character
if not Span.has_extension("history_cues"):
Span.set_extension("history_cues", default=[])
@@ -216,20 +295,7 @@ def set_extensions(cls) -> None:
)
def process(self, doc: Doc) -> Doc:
- """
- Finds entities related to history.
-
- Parameters
- ----------
- doc:
- spaCy Doc object
-
- Returns
- -------
- doc:
- spaCy Doc object, annotated for history
- """
-
+ note_datetime = None
if doc._.note_datetime is not None:
try:
note_datetime = pendulum.instance(doc._.note_datetime)
@@ -241,6 +307,7 @@ def process(self, doc: Doc) -> Doc:
)
note_datetime = None
+ birth_datetime = None
if doc._.birth_datetime is not None:
try:
birth_datetime = pendulum.instance(doc._.birth_datetime)
@@ -254,13 +321,13 @@ def process(self, doc: Doc) -> Doc:
matches = self.get_matches(doc)
- terminations = get_spans(matches, "termination")
+ terminations = [m for m in matches if m.label_ == "termination"]
boundaries = self._boundaries(doc, terminations)
# Removes duplicate matches and pseudo-expressions in one statement
matches = filter_spans(matches, label_to_remove="pseudo")
- entities = self.get_spans(doc)
+ entities = list(get_spans(doc, self.span_getter))
ents = None
sub_sections = None
sub_recent_dates = None
@@ -278,28 +345,30 @@ def process(self, doc: Doc) -> Doc:
recent_dates = []
if self.dates:
for date in doc.spans["dates"]:
- if date.label_ == "relative":
- if date._.date.direction.value == "CURRENT":
+ value = date._.date
+ if value.mode == "relative":
+ if value.direction.value == "current":
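+                    # "Current" dates ("cette année", "ce mois-ci", ...) count
+                    # as recent when the history window covers the period they
+                    # denote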
if (
- (
- date._.date.year == 0
- and self.history_limit >= timedelta(365)
- )
+ (value.year == 0 and self.history_limit >= timedelta(365))
or (
- date._.date.month == 0
- and self.history_limit >= timedelta(30)
+ value.month == 0 and self.history_limit >= timedelta(30)
)
- or (
- date._.date.week == 0
- and self.history_limit >= timedelta(7)
- )
- or (date._.date.day == 0)
+ or (value.week == 0 and self.history_limit >= timedelta(7))
+ or (value.day == 0)
):
recent_dates.append(
Span(doc, date.start, date.end, label="relative_date")
)
- elif date._.date.direction.value == "PAST":
- if -date._.date.to_datetime() >= self.history_limit:
+ elif value.direction.value == "past":
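+                    # Past dates ("il y a trois mois") become history when the
+                    # elapsed duration reaches the history limit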
+ if (
+ -value.to_duration(
+ note_datetime=doc._.note_datetime,
+ infer_from_context=True,
+ tz="Europe/Paris",
+ default_day=15,
+ )
+ >= self.history_limit
+ ):
history_dates.append(
Span(doc, date.start, date.end, label="relative_date")
)
@@ -307,9 +376,9 @@ def process(self, doc: Doc) -> Doc:
recent_dates.append(
Span(doc, date.start, date.end, label="relative_date")
)
- elif date.label_ == "absolute" and doc._.note_datetime:
+ elif value.mode == "absolute" and doc._.note_datetime:
try:
- absolute_date = date._.date.to_datetime(
+ absolute_date = value.to_datetime(
note_datetime=note_datetime,
infer_from_context=True,
tz="Europe/Paris",
@@ -321,7 +390,7 @@ def process(self, doc: Doc) -> Doc:
"In doc {}, the following date {} raises this error: {}. "
"Skipping this date.",
doc._.note_id,
- date._.date,
+ value,
e,
)
if absolute_date:
@@ -353,6 +422,9 @@ def process(self, doc: Doc) -> Doc:
sub_sections, sections = consume_spans(
sections, lambda s: s.start < end <= s.end, sub_sections
)
+
+ close_recent_dates = []
+ close_history_dates = []
if self.dates:
sub_recent_dates, recent_dates = consume_spans(
recent_dates,
@@ -367,8 +439,6 @@ def process(self, doc: Doc) -> Doc:
# Filter dates inside the boundaries only
if self.closest_dates_only:
- close_recent_dates = []
- close_history_dates = []
if sub_recent_dates:
close_recent_dates = [
recent_date
@@ -434,7 +504,7 @@ def process(self, doc: Doc) -> Doc:
if self.on_ents_only and not ents:
continue
- history_cues = get_spans(sub_matches, "history")
+ history_cues = [m for m in sub_matches if m.label_ == "history"]
recent_cues = []
if self.sections:
@@ -454,7 +524,7 @@ def process(self, doc: Doc) -> Doc:
if not self.on_ents_only:
for token in doc[start:end]:
- token._.history = history
+ token._.history = token._.history or history
for ent in ents:
ent._.history = ent._.history or history
@@ -462,7 +532,6 @@ def process(self, doc: Doc) -> Doc:
if self.explain:
ent._.history_cues += history_cues
ent._.recent_cues += recent_cues
-
if not self.on_ents_only and ent._.history:
for token in ent:
token._.history = True
diff --git a/edsnlp/pipelines/qualifiers/hypothesis/__init__.py b/edsnlp/pipelines/qualifiers/hypothesis/__init__.py
index 6bd5c896e..ecee4a46f 100644
--- a/edsnlp/pipelines/qualifiers/hypothesis/__init__.py
+++ b/edsnlp/pipelines/qualifiers/hypothesis/__init__.py
@@ -1,2 +1,3 @@
-from .hypothesis import Hypothesis
-from .patterns import confirmation, following, preceding, pseudo, verbs_eds, verbs_hyp
+from .hypothesis import HypothesisQualifier
+
+Hypothesis = HypothesisQualifier
diff --git a/edsnlp/pipelines/qualifiers/hypothesis/factory.py b/edsnlp/pipelines/qualifiers/hypothesis/factory.py
index 873cb8b6f..7af528246 100644
--- a/edsnlp/pipelines/qualifiers/hypothesis/factory.py
+++ b/edsnlp/pipelines/qualifiers/hypothesis/factory.py
@@ -1,59 +1,29 @@
-from typing import List, Optional, Set, Union
-
from spacy.language import Language
-from edsnlp.pipelines.qualifiers.hypothesis import Hypothesis
from edsnlp.utils.deprecation import deprecated_factory
+from .hypothesis import HypothesisQualifier
+
DEFAULT_CONFIG = dict(
pseudo=None,
preceding=None,
following=None,
- termination=None,
- verbs_hyp=None,
verbs_eds=None,
+ verbs_hyp=None,
+ termination=None,
attr="NORM",
+ span_getter=None,
on_ents_only=True,
within_ents=False,
explain=False,
)
-
-@deprecated_factory(
+create_component = deprecated_factory(
"hypothesis",
"eds.hypothesis",
- default_config=DEFAULT_CONFIG,
assigns=["span._.hypothesis"],
-)
-@Language.factory(
+)(HypothesisQualifier)
+create_component = Language.factory(
"eds.hypothesis",
- default_config=DEFAULT_CONFIG,
assigns=["span._.hypothesis"],
-)
-def create_component(
- nlp: Language,
- name: str,
- attr: str,
- pseudo: Optional[List[str]],
- preceding: Optional[List[str]],
- following: Optional[List[str]],
- termination: Optional[List[str]],
- verbs_eds: Optional[List[str]],
- verbs_hyp: Optional[List[str]],
- on_ents_only: Union[bool, str, List[str], Set[str]],
- within_ents: bool,
- explain: bool,
-):
- return Hypothesis(
- nlp=nlp,
- attr=attr,
- pseudo=pseudo,
- preceding=preceding,
- following=following,
- termination=termination,
- verbs_eds=verbs_eds,
- verbs_hyp=verbs_hyp,
- on_ents_only=on_ents_only,
- within_ents=within_ents,
- explain=explain,
- )
+)(create_component)
diff --git a/edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py b/edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
index c5fa4d57f..26cb90a6c 100644
--- a/edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
+++ b/edsnlp/pipelines/qualifiers/hypothesis/hypothesis.py
@@ -3,18 +3,20 @@
from spacy.language import Language
from spacy.tokens import Doc, Span, Token
-from edsnlp.pipelines.qualifiers.base import Qualifier
-from edsnlp.pipelines.terminations import termination
-from edsnlp.utils.filter import consume_spans, filter_spans, get_spans
+from edsnlp.pipelines.base import SpanGetterArg, get_spans
+from edsnlp.pipelines.qualifiers.base import RuleBasedQualifier
+from edsnlp.pipelines.terminations import termination as default_termination
+from edsnlp.utils.filter import consume_spans, filter_spans
from edsnlp.utils.inclusion import check_inclusion
from edsnlp.utils.resources import get_verbs
-from .patterns import following, preceding, pseudo, verbs_eds, verbs_hyp
+from . import patterns
-class Hypothesis(Qualifier):
+class HypothesisQualifier(RuleBasedQualifier):
"""
- Hypothesis detection with spaCy.
+    The `eds.hypothesis` component uses a simple rule-based algorithm to detect spans
+ that are speculations rather than certain statements.
The component looks for five kinds of expressions in the text :
@@ -25,10 +27,79 @@ class Hypothesis(Qualifier):
- hypothetical verbs : verbs indicating hypothesis (eg "douter")
- classic verbs conjugated to the conditional, thus indicating hypothesis
+ Examples
+ --------
+ The following snippet matches a simple terminology, and checks whether the extracted
+ entities are part of a speculation. It is complete and can be run _as is_.
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ # Dummy matcher
+ nlp.add_pipe(
+ "eds.matcher",
+ config=dict(terms=dict(douleur="douleur", fracture="fracture")),
+ )
+ nlp.add_pipe("eds.hypothesis")
+
+ text = (
+ "Le patient est admis le 23 août 2021 pour une douleur au bras. "
+ "Possible fracture du radius."
+ )
+
+ doc = nlp(text)
+
+ doc.ents
+ # Out: (douleur, fracture)
+
+ doc.ents[0]._.hypothesis
+ # Out: False
+
+ doc.ents[1]._.hypothesis
+ # Out: True
+ ```
+
+ Extensions
+ ----------
+ The `eds.hypothesis` component declares two extensions, on both `Span` and `Token`
+    objects:
+
+ 1. The `hypothesis` attribute is a boolean, set to `True` if the component predicts
+ that the span/token is a speculation.
+ 2. The `hypothesis_` property is a human-readable string, computed from the
+ `hypothesis` attribute. It implements a simple getter function that outputs
+ `HYP` or `CERT`, depending on the value of `hypothesis`.
+
+ Performance
+    -----------
+    The component's performance is measured on three datasets:
+
+ - The ESSAI ([@dalloux2017ESSAI]) and CAS ([@grabar2018CAS]) datasets were developed
+ at the CNRS. The two are concatenated.
+    - The NegParHyp corpus was specifically developed at AP-HP's clinical data
+      warehouse (CDW) to test the component on actual, pseudonymised clinical notes.
+
+ | Dataset | Hypothesis F1 |
+ | --------- | ------------- |
+ | CAS/ESSAI | 49% |
+ | NegParHyp | 52% |
+
+ !!! note "NegParHyp corpus"
+
+ The NegParHyp corpus was built by matching a subset of the MeSH terminology with
+ around 300 documents from AP-HP's clinical data warehouse. Matched entities were
+ then labelled for negation, speculation and family context.
+
Parameters
----------
nlp : Language
- spaCy nlp pipeline to use for matching.
+ The pipeline object.
+ name : Optional[str]
+ The component name.
pseudo : Optional[List[str]]
List of pseudo hypothesis cues.
preceding : Optional[List[str]]
@@ -39,11 +110,15 @@ class Hypothesis(Qualifier):
List of hypothetical verbs.
verbs_eds : Optional[List[str]]
List of mainstream verbs.
+ termination : Optional[List[str]]
+ List of termination terms.
attr : str
spaCy's attribute to use:
a string with the value "TEXT" or "NORM", or a dict with the key 'term_attr'
we can also add a key for each regex.
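+    span_getter : SpanGetterArg
+        Where to look for entities in the doc. By default, look in `doc.ents`.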
on_ents_only : Union[bool, str, List[str], Set[str]]
+ Deprecated, use `span_getter` instead.
+
Whether to look for matches around detected entities only.
Useful for faster inference in downstream tasks.
@@ -54,39 +129,36 @@ class Hypothesis(Qualifier):
Whether to consider cues within entities.
explain : bool
Whether to keep track of cues for each entity.
- """
- defaults = dict(
- following=following,
- preceding=preceding,
- pseudo=pseudo,
- termination=termination,
- verbs_eds=verbs_eds,
- verbs_hyp=verbs_hyp,
- )
+ Authors and citation
+ --------------------
+    The `eds.hypothesis` component was developed by AP-HP's Data Science team.
+ """
def __init__(
self,
nlp: Language,
- attr: str,
- pseudo: Optional[List[str]],
- preceding: Optional[List[str]],
- following: Optional[List[str]],
- termination: Optional[List[str]],
- verbs_eds: Optional[List[str]],
- verbs_hyp: Optional[List[str]],
- on_ents_only: Union[bool, str, List[str], Set[str]],
- within_ents: bool,
- explain: bool,
+ name: Optional[str] = "eds.hypothesis",
+ *,
+ pseudo: Optional[List[str]] = None,
+ preceding: Optional[List[str]] = None,
+ following: Optional[List[str]] = None,
+ verbs_eds: Optional[List[str]] = None,
+ verbs_hyp: Optional[List[str]] = None,
+ termination: Optional[List[str]] = None,
+ attr: str = "NORM",
+ span_getter: SpanGetterArg = None,
+ on_ents_only: Union[bool, str, List[str], Set[str]] = True,
+ within_ents: bool = False,
+ explain: bool = False,
):
-
- terms = self.get_defaults(
- pseudo=pseudo,
- preceding=preceding,
- following=following,
- termination=termination,
- verbs_eds=verbs_eds,
- verbs_hyp=verbs_hyp,
+ terms = dict(
+ pseudo=patterns.pseudo if pseudo is None else pseudo,
+ preceding=patterns.preceding if preceding is None else preceding,
+ following=patterns.following if following is None else following,
+ termination=default_termination if termination is None else termination,
+ verbs_eds=patterns.verbs_eds if verbs_eds is None else verbs_eds,
+ verbs_hyp=patterns.verbs_hyp if verbs_hyp is None else verbs_hyp,
)
terms["verbs_preceding"], terms["verbs_following"] = self.load_verbs(
verbs_hyp=terms.pop("verbs_hyp"),
@@ -95,34 +167,28 @@ def __init__(
super().__init__(
nlp=nlp,
+ name=name,
attr=attr,
- on_ents_only=on_ents_only,
explain=explain,
- **terms,
+ terms=terms,
+ on_ents_only=on_ents_only,
+ span_getter=span_getter,
)
self.within_ents = within_ents
self.set_extensions()
- @classmethod
- def set_extensions(cls) -> None:
- if not Token.has_extension("hypothesis"):
- Token.set_extension("hypothesis", default=False)
-
- if not Token.has_extension("hypothesis_"):
- Token.set_extension(
- "hypothesis_",
- getter=lambda token: "HYP" if token._.hypothesis else "CERT",
- )
+ def set_extensions(self) -> None:
+ super().set_extensions()
+ for cls in (Token, Span):
+ if not cls.has_extension("hypothesis"):
+ cls.set_extension("hypothesis", default=False)
- if not Span.has_extension("hypothesis"):
- Span.set_extension("hypothesis", default=False)
-
- if not Span.has_extension("hypothesis_"):
- Span.set_extension(
- "hypothesis_",
- getter=lambda span: "HYP" if span._.hypothesis else "CERT",
- )
+ if not cls.has_extension("hypothesis_"):
+ cls.set_extension(
+ "hypothesis_",
+ getter=lambda token: "HYP" if token._.hypothesis else "CERT",
+ )
if not Span.has_extension("hypothesis_cues"):
Span.set_extension("hypothesis_cues", default=[])
@@ -166,27 +232,15 @@ def load_verbs(
)
def process(self, doc: Doc) -> Doc:
- """
- Finds entities related to hypothesis.
-
- Parameters
- ----------
- doc: spaCy Doc object
-
- Returns
- -------
- doc: spaCy Doc object, annotated for hypothesis
- """
-
matches = self.get_matches(doc)
- terminations = get_spans(matches, "termination")
+ terminations = [m for m in matches if m.label_ == "termination"]
boundaries = self._boundaries(doc, terminations)
# Removes duplicate matches and pseudo-expressions in one statement
matches = filter_spans(matches, label_to_remove="pseudo")
- entities = list(self.get_spans(doc))
+ entities = list(get_spans(doc, self.span_getter))
ents = None
for start, end in boundaries:
@@ -204,10 +258,12 @@ def process(self, doc: Doc) -> Doc:
if self.on_ents_only and not ents:
continue
- sub_preceding = get_spans(sub_matches, "preceding")
- sub_following = get_spans(sub_matches, "following")
- sub_preceding += get_spans(sub_matches, "verbs_preceding")
- sub_following += get_spans(sub_matches, "verbs_following")
+ sub_preceding = [m for m in sub_matches if m.label_ == "preceding"]
+ sub_following = [m for m in sub_matches if m.label_ == "following"]
+            # Verbs preceding hypothetical content
+            sub_preceding += [m for m in sub_matches if m.label_ == "verbs_preceding"]
+            # Verbs following hypothetical content
+            sub_following += [m for m in sub_matches if m.label_ == "verbs_following"]
if not sub_preceding + sub_following:
continue
@@ -219,7 +275,6 @@ def process(self, doc: Doc) -> Doc:
) or any(m.start > token.i for m in sub_following)
for ent in ents:
-
if self.within_ents:
cues = [m for m in sub_preceding if m.end <= ent.end]
cues += [m for m in sub_following if m.start >= ent.start]
diff --git a/edsnlp/pipelines/qualifiers/negation/__init__.py b/edsnlp/pipelines/qualifiers/negation/__init__.py
index abf11d815..6e7e6e93f 100644
--- a/edsnlp/pipelines/qualifiers/negation/__init__.py
+++ b/edsnlp/pipelines/qualifiers/negation/__init__.py
@@ -1 +1,3 @@
-from .negation import Negation
+from .negation import NegationQualifier
+
+Negation = NegationQualifier
diff --git a/edsnlp/pipelines/qualifiers/negation/factory.py b/edsnlp/pipelines/qualifiers/negation/factory.py
index f0ece500f..f30490374 100644
--- a/edsnlp/pipelines/qualifiers/negation/factory.py
+++ b/edsnlp/pipelines/qualifiers/negation/factory.py
@@ -1,57 +1,28 @@
-from typing import List, Optional, Set, Union
-
from spacy.language import Language
-from edsnlp.pipelines.qualifiers.negation import Negation
from edsnlp.utils.deprecation import deprecated_factory
+from .negation import NegationQualifier
+
DEFAULT_CONFIG = dict(
pseudo=None,
preceding=None,
following=None,
- termination=None,
verbs=None,
+ termination=None,
attr="NORM",
+ span_getter=None,
on_ents_only=True,
within_ents=False,
explain=False,
)
-
-@deprecated_factory(
+create_component = deprecated_factory(
"negation",
"eds.negation",
- default_config=DEFAULT_CONFIG,
assigns=["span._.negation"],
-)
-@Language.factory(
+)(NegationQualifier)
+create_component = Language.factory(
"eds.negation",
- default_config=DEFAULT_CONFIG,
assigns=["span._.negation"],
-)
-def create_component(
- nlp: Language,
- name: str,
- attr: str,
- pseudo: Optional[List[str]],
- preceding: Optional[List[str]],
- following: Optional[List[str]],
- termination: Optional[List[str]],
- verbs: Optional[List[str]],
- on_ents_only: Union[bool, str, List[str], Set[str]],
- within_ents: bool,
- explain: bool,
-):
-
- return Negation(
- nlp=nlp,
- attr=attr,
- pseudo=pseudo,
- preceding=preceding,
- following=following,
- termination=termination,
- verbs=verbs,
- on_ents_only=on_ents_only,
- within_ents=within_ents,
- explain=explain,
- )
+)(create_component)
diff --git a/edsnlp/pipelines/qualifiers/negation/negation.py b/edsnlp/pipelines/qualifiers/negation/negation.py
index bdc4004bf..2ad21790c 100644
--- a/edsnlp/pipelines/qualifiers/negation/negation.py
+++ b/edsnlp/pipelines/qualifiers/negation/negation.py
@@ -3,51 +3,124 @@
from spacy.language import Language
from spacy.tokens import Doc, Span, Token
-from edsnlp.pipelines.qualifiers.base import Qualifier
-from edsnlp.pipelines.terminations import termination
+from edsnlp.pipelines.base import SpanGetterArg, get_spans
+from edsnlp.pipelines.qualifiers.base import RuleBasedQualifier
+from edsnlp.pipelines.terminations import termination as default_termination
from edsnlp.utils.deprecation import deprecated_getter_factory
-from edsnlp.utils.filter import consume_spans, filter_spans, get_spans
+from edsnlp.utils.filter import consume_spans, filter_spans
from edsnlp.utils.inclusion import check_inclusion
from edsnlp.utils.resources import get_verbs
-from .patterns import following, preceding, pseudo, verbs
+from . import patterns
-class Negation(Qualifier):
+class NegationQualifier(RuleBasedQualifier):
"""
- Implements the NegEx algorithm.
+ The `eds.negation` component uses a simple rule-based algorithm to detect negated
+ spans. It was designed at AP-HP's EDS, following the insights of the NegEx algorithm
+ by [@chapman_simple_2001].
The component looks for five kinds of expressions in the text :
- - preceding negations, ie cues that precede a negated expression
-
- - following negations, ie cues that follow a negated expression
-
+ - preceding negations, i.e., cues that precede a negated expression
+ - following negations, i.e., cues that follow a negated expression
- pseudo negations : contain a negation cue, but are not negations
(eg "pas de doute"/"no doubt")
+ - negation verbs, i.e., verbs that indicate a negation
+ - terminations, i.e., words that delimit propositions.
+ The negation spans from the preceding cue to the termination.
- - negation verbs, ie verbs that indicate a negation
+ Examples
+ --------
+ The following snippet matches a simple terminology, and checks the polarity of the
+ extracted entities. It is complete and can be run _as is_.
- - terminations, ie words that delimit propositions.
- The negation spans from the preceding cue to the termination.
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ # Dummy matcher
+ nlp.add_pipe(
+ "eds.matcher",
+ config=dict(terms=dict(patient="patient", fracture="fracture")),
+ )
+ nlp.add_pipe("eds.negation")
+
+ text = (
+ "Le patient est admis le 23 août 2021 pour une douleur au bras. "
+ "Le scanner ne détecte aucune fracture."
+ )
+
+ doc = nlp(text)
+
+ doc.ents
+ # Out: (patient, fracture)
+
+ doc.ents[0]._.negation # (1)
+ # Out: False
+
+ doc.ents[1]._.negation
+ # Out: True
+ ```
+
+ 1. The result of the component is kept in the `negation` custom extension.
+
+ Extensions
+ ----------
+ The `eds.negation` component declares two extensions, on both `Span` and `Token`
+    objects:
+
+ 1. The `negation` attribute is a boolean, set to `True` if the component predicts
+ that the span/token is negated.
+ 2. The `negation_` property is a human-readable string, computed from the `negation`
+ attribute. It implements a simple getter function that outputs `AFF` or `NEG`,
+ depending on the value of `negation`.
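+
+    Continuing the example above (a minimal illustration; `polarity_` is the
+    deprecated alias of `negation_`):
+
+    ```python
+    doc.ents[1]._.negation_
+    # Out: 'NEG'
+    ```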
+
+ Performance
+ -----------
+    The component's performance is measured on three datasets:
+
+ - The ESSAI ([@dalloux2017ESSAI]) and CAS ([@grabar2018CAS]) datasets were developed
+ at the CNRS. The two are concatenated.
+    - The NegParHyp corpus was specifically developed at AP-HP to test the component
+      on actual clinical notes, using pseudonymised notes from AP-HP's clinical data
+      warehouse.
+
+ | Dataset | Negation F1 |
+ |-----------|-------------|
+ | CAS/ESSAI | 71% |
+ | NegParHyp | 88% |
+
+ !!! note "NegParHyp corpus"
+
+ The NegParHyp corpus was built by matching a subset of the MeSH terminology with
+ around 300 documents from AP-HP's clinical data warehouse. Matched entities were
+ then labelled for negation, speculation and family context.
Parameters
----------
nlp : Language
- spaCy nlp pipeline to use for matching.
+ The pipeline object.
+ name : Optional[str]
+ The component name.
attr : str
spaCy's attribute to use
pseudo : Optional[List[str]]
- List of pseudo negation terms.
+ List of pseudo negation cues.
preceding : Optional[List[str]]
- List of preceding negation terms
+        List of preceding negation cues.
following : Optional[List[str]]
- List of following negation terms.
- termination : Optional[List[str]]
- List of termination terms.
+ List of following negation cues.
verbs : Optional[List[str]]
List of negation verbs.
+ termination : Optional[List[str]]
+ List of termination terms.
+    span_getter : SpanGetterArg
+        Where to look for entities in the doc. By default, look in `doc.ents`.
on_ents_only : Union[bool, str, List[str], Set[str]]
+ Deprecated, use `span_getter` instead.
+
Whether to look for matches around detected entities only.
Useful for faster inference in downstream tasks.
@@ -58,36 +131,34 @@ class Negation(Qualifier):
Whether to consider cues within entities.
explain : bool
Whether to keep track of cues for each entity.
- """
- defaults = dict(
- following=following,
- preceding=preceding,
- pseudo=pseudo,
- verbs=verbs,
- termination=termination,
- )
+ Authors and citation
+ --------------------
+ The `eds.negation` component was developed by AP-HP's Data Science team.
+ """
def __init__(
self,
nlp: Language,
- attr: str,
- pseudo: Optional[List[str]],
- preceding: Optional[List[str]],
- following: Optional[List[str]],
- termination: Optional[List[str]],
- verbs: Optional[List[str]],
- on_ents_only: Union[bool, str, List[str], Set[str]],
- within_ents: bool,
- explain: bool,
+ name: Optional[str] = "eds.negation",
+ *,
+ pseudo: Optional[List[str]] = None,
+ preceding: Optional[List[str]] = None,
+ following: Optional[List[str]] = None,
+ verbs: Optional[List[str]] = None,
+ termination: Optional[List[str]] = None,
+ attr: str = "NORM",
+ span_getter: SpanGetterArg = None,
+ on_ents_only: Union[bool, str, List[str], Set[str]] = True,
+ within_ents: bool = False,
+ explain: bool = False,
):
-
- terms = self.get_defaults(
- pseudo=pseudo,
- preceding=preceding,
- following=following,
- termination=termination,
- verbs=verbs,
+ terms = dict(
+ pseudo=patterns.pseudo if pseudo is None else pseudo,
+ preceding=patterns.preceding if preceding is None else preceding,
+ following=patterns.following if following is None else following,
+ termination=default_termination if termination is None else termination,
+ verbs=patterns.verbs if verbs is None else verbs,
)
terms["verbs_preceding"], terms["verbs_following"] = self.load_verbs(
terms["verbs"]
@@ -95,64 +166,43 @@ def __init__(
super().__init__(
nlp=nlp,
+ name=name,
attr=attr,
- on_ents_only=on_ents_only,
explain=explain,
- **terms,
+ terms=terms,
+ on_ents_only=on_ents_only,
+ span_getter=span_getter,
)
self.within_ents = within_ents
self.set_extensions()
- @classmethod
- def set_extensions(cl) -> None:
-
- if not Token.has_extension("negation"):
- Token.set_extension("negation", default=False)
-
- if not Token.has_extension("negated"):
- Token.set_extension(
- "negated", getter=deprecated_getter_factory("negated", "negation")
- )
-
- if not Token.has_extension("negation_"):
- Token.set_extension(
- "negation_",
- getter=lambda token: "NEG" if token._.negation else "AFF",
- )
+ def set_extensions(self) -> None:
+ super().set_extensions()
+ for cls in (Token, Span):
+ if not cls.has_extension("negation"):
+ cls.set_extension("negation", default=False)
- if not Token.has_extension("polarity_"):
- Token.set_extension(
- "polarity_",
- getter=deprecated_getter_factory("polarity_", "negation_"),
- )
+ if not cls.has_extension("negated"):
+ cls.set_extension(
+ "negated", getter=deprecated_getter_factory("negated", "negation")
+ )
- if not Span.has_extension("negation"):
- Span.set_extension("negation", default=False)
+ if not cls.has_extension("negation_"):
+ cls.set_extension(
+ "negation_",
+ getter=lambda token: "NEG" if token._.negation else "AFF",
+ )
- if not Span.has_extension("negated"):
- Span.set_extension(
- "negated", getter=deprecated_getter_factory("negated", "negation")
- )
+ if not cls.has_extension("polarity_"):
+ cls.set_extension(
+ "polarity_",
+ getter=deprecated_getter_factory("polarity_", "negation_"),
+ )
if not Span.has_extension("negation_cues"):
Span.set_extension("negation_cues", default=[])
- if not Span.has_extension("negation_"):
- Span.set_extension(
- "negation_",
- getter=lambda span: "NEG" if span._.negation else "AFF",
- )
-
- if not Span.has_extension("polarity_"):
- Span.set_extension(
- "polarity_",
- getter=deprecated_getter_factory("polarity_", "negation_"),
- )
-
- if not Doc.has_extension("negations"):
- Doc.set_extension("negations", default=[])
-
def load_verbs(self, verbs: List[str]) -> List[str]:
"""
Conjugate negating verbs to specific tenses.
@@ -179,66 +229,18 @@ def load_verbs(self, verbs: List[str]) -> List[str]:
list_neg_verbs_preceding = list(neg_verbs_preceding["term"].unique())
list_neg_verbs_following = list(neg_verbs_following["term"].unique())
- return (list_neg_verbs_preceding, list_neg_verbs_following)
-
- def annotate_entity(
- self,
- ent: Span,
- sub_preceding: List[Span],
- sub_following: List[Span],
- ) -> None:
- """
- Annotate entities using preceding and following negations.
-
- Parameters
- ----------
- ent : Span
- Entity to annotate
- sub_preceding : List[Span]
- List of preceding negations cues
- sub_following : List[Span]
- List of following negations cues
- """
- if self.within_ents:
- cues = [m for m in sub_preceding if m.end <= ent.end]
- cues += [m for m in sub_following if m.start >= ent.start]
- else:
- cues = [m for m in sub_preceding if m.end <= ent.start]
- cues += [m for m in sub_following if m.start >= ent.end]
-
- negation = ent._.negation or bool(cues)
-
- ent._.negation = negation
-
- if self.explain and negation:
- ent._.negation_cues += cues
-
- if not self.on_ents_only and negation:
- for token in ent:
- token._.negation = True
+ return list_neg_verbs_preceding, list_neg_verbs_following
def process(self, doc: Doc) -> Doc:
- """
- Finds entities related to negation.
-
- Parameters
- ----------
- doc: spaCy `Doc` object
-
- Returns
- -------
- doc: spaCy `Doc` object, annotated for negation
- """
-
matches = self.get_matches(doc)
- terminations = get_spans(matches, "termination")
+ terminations = [m for m in matches if m.label_ == "termination"]
boundaries = self._boundaries(doc, terminations)
# Removes duplicate matches and pseudo-expressions in one statement
matches = filter_spans(matches, label_to_remove="pseudo")
- entities = list(self.get_spans(doc))
+ entities = list(get_spans(doc, self.span_getter))
ents = None
for start, end in boundaries:
@@ -256,12 +258,12 @@ def process(self, doc: Doc) -> Doc:
if self.on_ents_only and not ents:
continue
- sub_preceding = get_spans(sub_matches, "preceding")
- sub_following = get_spans(sub_matches, "following")
+ sub_preceding = [m for m in sub_matches if m.label_ == "preceding"]
+ sub_following = [m for m in sub_matches if m.label_ == "following"]
# Verbs preceding negated content
- sub_preceding += get_spans(sub_matches, "verbs_preceding")
+ sub_preceding += [m for m in sub_matches if m.label_ == "verbs_preceding"]
# Verbs following negated content
- sub_following += get_spans(sub_matches, "verbs_following")
+ sub_following += [m for m in sub_matches if m.label_ == "verbs_following"]
if not sub_preceding + sub_following:
continue
@@ -273,13 +275,22 @@ def process(self, doc: Doc) -> Doc:
) or any(m.start > token.i for m in sub_following)
for ent in ents:
- self.annotate_entity(
- ent=ent,
- sub_preceding=sub_preceding,
- sub_following=sub_following,
- )
+ if self.within_ents:
+ cues = [m for m in sub_preceding if m.end <= ent.end]
+ cues += [m for m in sub_following if m.start >= ent.start]
+ else:
+ cues = [m for m in sub_preceding if m.end <= ent.start]
+ cues += [m for m in sub_following if m.start >= ent.end]
- return doc
+ negation = ent._.negation or bool(cues)
- def __call__(self, doc: Doc) -> Doc:
- return self.process(doc)
+ ent._.negation = negation
+
+ if self.explain and negation:
+ ent._.negation_cues += cues
+
+ if not self.on_ents_only and negation:
+ for token in ent:
+ token._.negation = True
+
+ return doc
diff --git a/edsnlp/pipelines/qualifiers/reported_speech/__init__.py b/edsnlp/pipelines/qualifiers/reported_speech/__init__.py
index 00de62527..b109fd91c 100644
--- a/edsnlp/pipelines/qualifiers/reported_speech/__init__.py
+++ b/edsnlp/pipelines/qualifiers/reported_speech/__init__.py
@@ -1 +1,3 @@
-from .reported_speech import ReportedSpeech
+from .reported_speech import ReportedSpeechQualifier
+
+ReportedSpeech = ReportedSpeechQualifier
diff --git a/edsnlp/pipelines/qualifiers/reported_speech/factory.py b/edsnlp/pipelines/qualifiers/reported_speech/factory.py
index 842a474c0..e8f782b48 100644
--- a/edsnlp/pipelines/qualifiers/reported_speech/factory.py
+++ b/edsnlp/pipelines/qualifiers/reported_speech/factory.py
@@ -1,10 +1,9 @@
-from typing import List, Optional, Set, Union
-
from spacy.language import Language
-from edsnlp.pipelines.qualifiers.reported_speech import ReportedSpeech
from edsnlp.utils.deprecation import deprecated_factory
+from .reported_speech import ReportedSpeechQualifier
+
DEFAULT_CONFIG = dict(
pseudo=None,
preceding=None,
@@ -12,51 +11,24 @@
quotation=None,
verbs=None,
attr="NORM",
+ span_getter=None,
on_ents_only=True,
within_ents=False,
explain=False,
)
-
-@deprecated_factory(
- "rspeech",
+create_component = ReportedSpeechQualifier
+create_component = deprecated_factory(
+ "reported_speech",
"eds.reported_speech",
- default_config=DEFAULT_CONFIG,
assigns=["span._.reported_speech"],
-)
-@deprecated_factory(
- "reported_speech",
+)(create_component)
+create_component = deprecated_factory(
+ "rspeech",
"eds.reported_speech",
- default_config=DEFAULT_CONFIG,
assigns=["span._.reported_speech"],
-)
-@Language.factory(
+)(create_component)
+create_component = Language.factory(
"eds.reported_speech",
- default_config=DEFAULT_CONFIG,
assigns=["span._.reported_speech"],
-)
-def create_component(
- nlp: Language,
- name: str,
- attr: str,
- pseudo: Optional[List[str]],
- preceding: Optional[List[str]],
- following: Optional[List[str]],
- quotation: Optional[List[str]],
- verbs: Optional[List[str]],
- on_ents_only: Union[bool, str, List[str], Set[str]],
- within_ents: bool,
- explain: bool,
-):
- return ReportedSpeech(
- nlp=nlp,
- attr=attr,
- pseudo=pseudo,
- preceding=preceding,
- following=following,
- quotation=quotation,
- verbs=verbs,
- on_ents_only=on_ents_only,
- within_ents=within_ents,
- explain=explain,
- )
+)(create_component)
diff --git a/edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py b/edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
index 59523f38b..0d6838c81 100644
--- a/edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
+++ b/edsnlp/pipelines/qualifiers/reported_speech/reported_speech.py
@@ -4,25 +4,72 @@
from spacy.tokens import Doc, Span, Token
from edsnlp.matchers.regex import RegexMatcher
-from edsnlp.pipelines.qualifiers.base import Qualifier
-from edsnlp.utils.filter import consume_spans, filter_spans, get_spans
+from edsnlp.pipelines.base import SpanGetterArg, get_spans
+from edsnlp.pipelines.qualifiers.base import RuleBasedQualifier
+from edsnlp.utils.filter import consume_spans, filter_spans
from edsnlp.utils.inclusion import check_inclusion
from edsnlp.utils.resources import get_verbs
-from .patterns import following, preceding, quotation, verbs
+from . import patterns
-class ReportedSpeech(Qualifier):
+class ReportedSpeechQualifier(RuleBasedQualifier):
"""
- Implements a reported speech detection algorithm.
+ The `eds.reported_speech` component uses a simple rule-based algorithm to detect
+    spans that relate to reported speech (e.g., when the doctor quotes the patient).
+ It was designed at AP-HP's EDS.
+
+ Examples
+ --------
+ The following snippet matches a simple terminology, and checks whether the extracted
+    entities are part of reported speech. It is complete and can be run _as is_.
+
+ ```python
+ import spacy
+
+ nlp = spacy.blank("eds")
+ nlp.add_pipe("eds.sentences")
+ # Dummy matcher
+ nlp.add_pipe(
+ "eds.matcher",
+ config=dict(terms=dict(patient="patient", alcool="alcoolisé")),
+ )
+ nlp.add_pipe("eds.reported_speech")
+
+ text = (
+ "Le patient est admis aux urgences ce soir pour une douleur au bras. "
+ "Il nie être alcoolisé."
+ )
+
+ doc = nlp(text)
+
+ doc.ents
+ # Out: (patient, alcoolisé)
+
+ doc.ents[0]._.reported_speech
+ # Out: False
+
+ doc.ents[1]._.reported_speech
+ # Out: True
+ ```
+
+ Extensions
+ ----------
+ The `eds.reported_speech` component declares two extensions, on both `Span` and
+    `Token` objects:
- The components looks for terms indicating patient statements,
- and quotations to detect patient speech.
+ 1. The `reported_speech` attribute is a boolean, set to `True` if the component
+ predicts that the span/token is reported.
+ 2. The `reported_speech_` property is a human-readable string, computed from the
+ `reported_speech` attribute. It implements a simple getter function that outputs
+ `DIRECT` or `REPORTED`, depending on the value of `reported_speech`.
Parameters
----------
nlp : Language
spaCy nlp pipeline to use for matching.
+ name : Optional[str]
+ The component name.
quotation : str
String gathering all quotation cues.
verbs : List[str]
@@ -47,35 +94,35 @@ class ReportedSpeech(Qualifier):
Whether to consider cues within entities.
explain : bool
Whether to keep track of cues for each entity.
- """
- defaults = dict(
- following=following,
- preceding=preceding,
- verbs=verbs,
- quotation=quotation,
- )
+ Authors and citation
+ --------------------
+ The `eds.reported_speech` component was developed by AP-HP's Data Science team.
+ """
def __init__(
self,
nlp: Language,
- attr: str,
- pseudo: Optional[List[str]],
- preceding: Optional[List[str]],
- following: Optional[List[str]],
- quotation: Optional[List[str]],
- verbs: Optional[List[str]],
- on_ents_only: Union[bool, str, List[str], Set[str]],
- within_ents: bool,
- explain: bool,
+ name: Optional[str] = "eds.reported_speech",
+ *,
+ pseudo: Optional[List[str]] = None,
+ preceding: Optional[List[str]] = None,
+ following: Optional[List[str]] = None,
+ quotation: Optional[List[str]] = None,
+ verbs: Optional[List[str]] = None,
+ attr: str = "NORM",
+ span_getter: SpanGetterArg = None,
+ on_ents_only: Union[bool, str, List[str], Set[str]] = True,
+ within_ents: bool = False,
+ explain: bool = False,
):
- terms = self.get_defaults(
- pseudo=pseudo,
- preceding=preceding,
- following=following,
- quotation=quotation,
- verbs=verbs,
+ terms = dict(
+ pseudo=pseudo or [],
+ preceding=patterns.preceding if preceding is None else preceding,
+ following=patterns.following if following is None else following,
+ quotation=patterns.quotation if quotation is None else quotation,
+ verbs=patterns.verbs if verbs is None else verbs,
)
terms["verbs"] = self.load_verbs(terms["verbs"])
@@ -83,48 +130,38 @@ def __init__(
super().__init__(
nlp=nlp,
+ name=name,
attr=attr,
- on_ents_only=on_ents_only,
explain=explain,
- **terms,
+ terms=terms,
+ on_ents_only=on_ents_only,
+ span_getter=span_getter,
)
self.regex_matcher = RegexMatcher(attr=attr)
- self.regex_matcher.build_patterns(dict(quotation=quotation))
+ self.regex_matcher.build_patterns(dict(quotation=terms["quotation"]))
self.within_ents = within_ents
-
self.set_extensions()
- @classmethod
- def set_extensions(cls) -> None:
-
- if not Token.has_extension("reported_speech"):
- Token.set_extension("reported_speech", default=False)
+ def set_extensions(self) -> None:
+ super().set_extensions()
- if not Token.has_extension("reported_speech_"):
- Token.set_extension(
- "reported_speech_",
- getter=lambda token: "REPORTED"
- if token._.reported_speech
- else "DIRECT",
- )
+ for cls in (Token, Span):
+ if not cls.has_extension("reported_speech"):
+ cls.set_extension("reported_speech", default=False)
- if not Span.has_extension("reported_speech"):
- Span.set_extension("reported_speech", default=False)
-
- if not Span.has_extension("reported_speech_"):
- Span.set_extension(
- "reported_speech_",
- getter=lambda span: "REPORTED" if span._.reported_speech else "DIRECT",
- )
+ if not cls.has_extension("reported_speech_"):
+ cls.set_extension(
+ "reported_speech_",
+ getter=lambda token: "REPORTED"
+ if token._.reported_speech
+ else "DIRECT",
+ )
if not Span.has_extension("reported_speech_cues"):
Span.set_extension("reported_speech_cues", default=[])
- if not Doc.has_extension("rspeechs"):
- Doc.set_extension("rspeechs", default=[])
-
def load_verbs(self, verbs: List[str]) -> List[str]:
"""
Conjugate reporting verbs to specific tenses (third person)
@@ -155,29 +192,17 @@ def load_verbs(self, verbs: List[str]) -> List[str]:
return list_rep_verbs
def process(self, doc: Doc) -> Doc:
- """
- Finds entities related to reported speech.
-
- Parameters
- ----------
- doc: spaCy Doc object
-
- Returns
- -------
- doc: spaCy Doc object, annotated for negation
- """
-
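+ """
+ Find entities related to reported speech.
+
+ Parameters
+ ----------
+ doc : Doc
+ spaCy Doc object
+
+ Returns
+ -------
+ Doc
+ spaCy Doc object, annotated for reported speech
+ """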
matches = self.get_matches(doc)
matches += list(self.regex_matcher(doc, as_spans=True))
boundaries = self._boundaries(doc)
- entities = self.get_spans(doc)
- ents = None
-
# Removes duplicate matches and pseudo-expressions in one statement
matches = filter_spans(matches, label_to_remove="pseudo")
+ entities = list(get_spans(doc, self.span_getter))
+ ents = None
+
for start, end in boundaries:
ents, entities = consume_spans(
@@ -193,10 +218,10 @@ def process(self, doc: Doc) -> Doc:
if self.on_ents_only and not ents:
continue
- sub_preceding = get_spans(sub_matches, "preceding")
- sub_following = get_spans(sub_matches, "following")
- sub_verbs = get_spans(sub_matches, "verbs")
- sub_quotation = get_spans(sub_matches, "quotation")
+ sub_preceding = [m for m in sub_matches if m.label_ == "preceding"]
+ sub_following = [m for m in sub_matches if m.label_ == "following"]
+ sub_verbs = [m for m in sub_matches if m.label_ == "verbs"]
+ sub_quotation = [m for m in sub_matches if m.label_ == "quotation"]
if not sub_preceding + sub_following + sub_verbs + sub_quotation:
continue
@@ -212,7 +237,6 @@ def process(self, doc: Doc) -> Doc:
)
)
for ent in ents:
-
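+ # when `within_ents` is set, cues located inside the entity also count;
+ # otherwise only cues in the surrounding zone are considered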
if self.within_ents:
cues = [m for m in sub_preceding + sub_verbs if m.end <= ent.end]
cues += [m for m in sub_following if m.start >= ent.start]
@@ -229,10 +253,11 @@ def process(self, doc: Doc) -> Doc:
reported_speech = ent._.reported_speech or bool(cues)
ent._.reported_speech = reported_speech
- if self.explain:
+ if self.explain and reported_speech:
ent._.reported_speech_cues += cues
if not self.on_ents_only and reported_speech:
for token in ent:
token._.reported_speech = True
+
return doc
diff --git a/edsnlp/utils/blocs.py b/edsnlp/utils/blocs.py
deleted file mode 100644
index f00bf8de3..000000000
--- a/edsnlp/utils/blocs.py
+++ /dev/null
@@ -1,198 +0,0 @@
-"""
-Utility that extracts code blocs and runs them.
-
-Largely inspired by https://github.com/koaning/mktestdocs
-"""
-
-import re
-from pathlib import Path
-from typing import List
-
-BLOCK_PATTERN = re.compile(
- (
- r"((?P<skip><!-- no-check -->)\s+)?(?P<indent> *)"
- r"```(?P<title>.*?)\n(?P<code>.+?)```"
- ),
- flags=re.DOTALL,
-)
-OUTPUT_PATTERN = "# Out: "
-
-
-def check_outputs(code: str) -> str:
- """
- Looks for output patterns, and modifies the bloc:
-
- 1. The preceding line becomes `#!python v = expr`
- 2. The output line becomes an `#!python assert` statement
-
- Parameters
- ----------
- code : str
- Code block
-
- Returns
- -------
- str
- Modified code bloc with assert statements
- """
-
- lines: List[str] = code.split("\n")
- code = []
-
- skip = False
-
- if len(lines) < 2:
- return code
-
- for expression, output in zip(lines[:-1], lines[1:]):
- if skip:
- skip = not skip
- continue
-
- if output.startswith(OUTPUT_PATTERN):
- expression = f"v = {expression}"
-
- output = output[len(OUTPUT_PATTERN) :].replace('"', r"\"")
- output = f'assert repr(v) == "{output}" or str(v) == "{output}"'
-
- code.append(expression)
- code.append(output)
-
- skip = True
-
- else:
- code.append(expression)
-
- if not skip:
- code.append(output)
-
- return "\n".join(code)
-
-
-def remove_indentation(code: str, indent: int) -> str:
- """
- Remove indentation from a code bloc.
-
- Parameters
- ----------
- code : str
- Code bloc
- indent : int
- Level of indentation
-
- Returns
- -------
- str
- Modified code bloc
- """
-
- if not indent:
- return code
-
- lines = []
-
- for line in code.split("\n"):
- lines.append(line[indent:])
-
- return "\n".join(lines)
-
-
-def grab_code_blocks(docstring: str, lang="python") -> List[str]:
- """
- Given a docstring, grab all the markdown codeblocks found in docstring.
-
- Parameters
- ----------
- docstring : str
- Full text.
- lang : str, optional
- Language to execute, by default "python"
-
- Returns
- -------
- List[str]
- Extracted code blocks
- """
- codeblocks = []
-
- for match in BLOCK_PATTERN.finditer(docstring):
- d = match.groupdict()
-
- if d["skip"]:
- continue
-
- if lang in d["title"]:
- code = remove_indentation(d["code"], len(d["indent"]))
- code = check_outputs(code)
- codeblocks.append(code)
-
- return codeblocks
-
-
-def printer(code: str) -> None:
- """
- Prints a code bloc with lines for easier debugging.
-
- Parameters
- ----------
- code : str
- Code bloc.
- """
- lines = []
- for i, line in enumerate(code.split("\n")):
- lines.append(f"{i + 1:03} {line}")
-
- print("\n".join(lines))
-
-
-def check_docstring(obj, lang=""):
- """
- Given a function, test the contents of the docstring.
- """
- for b in grab_code_blocks(obj.__doc__, lang=lang):
- try:
- exec(b, {"__MODULE__": "__main__"})
- except Exception:
- print(f"Error Encountered in `{obj.__name__}`. Caused by:\n")
- printer(b)
- raise
-
-
-def check_raw_string(raw, lang="python"):
- """
- Given a raw string, test the contents.
- """
- for b in grab_code_blocks(raw, lang=lang):
- try:
- exec(b, {"__MODULE__": "__main__"})
- except Exception:
- printer(b)
- raise
-
-
-def check_raw_file_full(raw, lang="python"):
- all_code = "\n".join(grab_code_blocks(raw, lang=lang))
- try:
- exec(all_code, {"__MODULE__": "__main__"})
- except Exception:
- printer(all_code)
- raise
-
-
-def check_md_file(path: Path, memory: bool = False) -> None:
- """
- Given a markdown file, parse the contents for Python code blocs
- and check that each independant bloc does not cause an error.
-
- Parameters
- ----------
- path : Path
- Path to the markdown file to execute.
- memory : bool, optional
- Whether to keep results from one bloc to the next, by default `#!python False`
- """
- text = Path(path).read_text()
- if memory:
- check_raw_file_full(text, lang="python")
- else:
- check_raw_string(text, lang="python")
diff --git a/edsnlp/utils/collections.py b/edsnlp/utils/collections.py
index 47db54aea..e69de29bb 100644
--- a/edsnlp/utils/collections.py
+++ b/edsnlp/utils/collections.py
@@ -1,13 +0,0 @@
-def dedup(sequence, key=None):
- """
- Deduplicate a sequence, keeping the last occurrence of each item.
-
- Parameters
- ----------
- sequence : Sequence
- Sequence to deduplicate
- key : Callable, optional
- Key function to use for deduplication, by default None
- """
- key = (lambda x: x) if key is None else key
- return list({key(item): item for item in sequence}.values())
diff --git a/edsnlp/viz/quick_examples.py b/edsnlp/viz/quick_examples.py
index e1053bc8e..40067b208 100644
--- a/edsnlp/viz/quick_examples.py
+++ b/edsnlp/viz/quick_examples.py
@@ -49,12 +49,12 @@ def __call__(
def get_ents(self):
all_spans = {k: list(s) for k, s in self.doc.spans.items() if s}
- all_spans["ents"] = list(self.doc.ents).copy()
+ all_spans["ents"] = list(self.doc.ents)
ents = []
for key, spans in all_spans.items():
- for span in spans:
+ for span in list(spans):
if span in all_spans["ents"]:
all_spans["ents"].remove(span)
start, end = span.start, span.end
diff --git a/mkdocs.yml b/mkdocs.yml
index 21f0d0cd0..fdac65d4a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -32,6 +32,7 @@ theme:
# - navigation.tabs
- navigation.top
- content.code.annotate
+ - content.code.copy
nav:
- index.md
@@ -52,25 +53,25 @@ nav:
- advanced-tutorials/word-vectors.md
- advanced-tutorials/fastapi.md
- Creating a pipeline: https://spacy.io/usage/processing-pipelines#custom-components" target="_blank
- - Pipelines:
- - pipelines/index.md
- - pipelines/architecture.md
+ - Pipes:
+ - Overview: pipelines/overview.md
- Core Pipelines:
- - pipelines/core/index.md
+ - Overview: pipelines/core/overview.md
+ - pipelines/core/normalizer.md
+ - pipelines/core/sentences.md
- pipelines/core/matcher.md
- - pipelines/core/contextual-matcher.md
- pipelines/core/terminology.md
- - pipelines/core/normalisation.md
+ - pipelines/core/contextual-matcher.md
- pipelines/core/endlines.md
- - pipelines/core/sentences.md
- Qualifiers:
- - pipelines/qualifiers/index.md
+ - Overview: pipelines/qualifiers/overview.md
- pipelines/qualifiers/negation.md
- pipelines/qualifiers/family.md
- pipelines/qualifiers/hypothesis.md
- pipelines/qualifiers/reported-speech.md
- pipelines/qualifiers/history.md
- Miscellaneous:
+ - Overview: pipelines/misc/overview.md
- pipelines/misc/index.md
- pipelines/misc/dates.md
- pipelines/misc/measurements.md
@@ -79,35 +80,43 @@ nav:
- pipelines/misc/reason.md
- pipelines/misc/tables.md
- Named Entity Recognition:
- - pipelines/ner/index.md
- - pipelines/ner/score.md
- - pipelines/ner/covid.md
- - pipelines/ner/drugs.md
- - pipelines/ner/adicap.md
- - pipelines/ner/cim10.md
- - pipelines/ner/umls.md
+ - Overview: pipelines/ner/overview.md
+ - Scores:
+ - Overview: pipelines/ner/scores/overview.md
+ - pipelines/ner/scores/charlson.md
+ - pipelines/ner/scores/emergency-ccmu.md
+ - pipelines/ner/scores/emergency-gemsa.md
+ - pipelines/ner/scores/emergency-priority.md
+ - pipelines/ner/scores/sofa.md
+ - pipelines/ner/scores/elston-ellis.md
- Disorders:
- - pipelines/ner/disorders/index.md
- - pipelines/ner/disorders/AIDS.md
- - pipelines/ner/disorders/CKD.md
- - pipelines/ner/disorders/COPD.md
- - pipelines/ner/disorders/cerebrovascular_accident.md
- - pipelines/ner/disorders/congestive_heart_failure.md
- - pipelines/ner/disorders/connective_tissue_disease.md
+ - Overview: pipelines/ner/disorders/overview.md
+ - pipelines/ner/disorders/aids.md
+ - pipelines/ner/disorders/ckd.md
+ - pipelines/ner/disorders/copd.md
+ - pipelines/ner/disorders/cerebrovascular-accident.md
+ - pipelines/ner/disorders/congestive-heart-failure.md
+ - pipelines/ner/disorders/connective-tissue-disease.md
- pipelines/ner/disorders/dementia.md
- pipelines/ner/disorders/diabetes.md
- pipelines/ner/disorders/hemiplegia.md
- pipelines/ner/disorders/leukemia.md
- - pipelines/ner/disorders/liver_disease.md
+ - pipelines/ner/disorders/liver-disease.md
- pipelines/ner/disorders/lymphoma.md
- - pipelines/ner/disorders/myocardial_infarction.md
- - pipelines/ner/disorders/peptic_ulcer_disease.md
- - pipelines/ner/disorders/peripheral_vascular_disease.md
- - pipelines/ner/disorders/solid_tumor.md
+ - pipelines/ner/disorders/myocardial-infarction.md
+ - pipelines/ner/disorders/peptic-ulcer-disease.md
+ - pipelines/ner/disorders/peripheral-vascular-disease.md
+ - pipelines/ner/disorders/solid-tumor.md
+ - pipelines/ner/covid.md
- Behaviors:
- - pipelines/ner/behaviors/index.md
+ - Overview: pipelines/ner/behaviors/overview.md
- pipelines/ner/behaviors/alcohol.md
- pipelines/ner/behaviors/tobacco.md
+ - pipelines/ner/adicap.md
+ - pipelines/ner/tnm.md
+ - pipelines/ner/umls.md
+ - pipelines/ner/cim10.md
+ - pipelines/ner/drugs.md
- Trainable components:
- pipelines/trainable/index.md
@@ -155,27 +164,35 @@ watch:
- contributing.md
- changelog.md
- edsnlp
+ - docs/scripts
hooks:
- docs/scripts/plugin.py
plugins:
- search
- - bibtex:
- bib_file: "docs/references.bib"
- - autorefs
+ - autorefs:
+ priority:
+ - '*'
+ - reference
- mkdocstrings:
+ enable_inventory: true
custom_templates: docs/assets/templates
handlers:
python:
options:
+ extensions:
+ - docs/scripts/griffe_ext.py:EDSNLPDocstrings
docstring_style: numpy
docstring_section_style: spacy
heading_level: 2
members_order: source
show_root_toc_entry: false
- watch:
- - "edsnlp"
+ show_signature: false
+ merge_init_into_class: true
+ - autolinks
+ - bibtex:
+ bibtex_file: "docs/references.bib"
# Just uncomment the following lines to enable i18n
# and start creating .fr.md and .en.md files.
# - i18n:
@@ -188,9 +205,9 @@ plugins:
- mike
markdown_extensions:
+ - pymdownx.highlight
- admonition
- pymdownx.superfences
- - pymdownx.highlight
- pymdownx.inlinehilite
- pymdownx.snippets
- pymdownx.tabbed:
@@ -204,3 +221,6 @@ markdown_extensions:
- pymdownx.emoji:
emoji_index: !!python/name:materialx.emoji.twemoji
emoji_generator: !!python/name:materialx.emoji.to_svg
+
+validation:
+ absolute_links: ignore
diff --git a/pyproject.toml b/pyproject.toml
index 0c7d0c603..1c5150953 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,21 +41,24 @@ dev = [
"pytest-cov>=3.0.0,<4.0.0",
"pytest-html>=3.1.1,<4.0.0",
"torch>=1.0.0",
-]
-setup = [
- "mlconjug3<3.9.0",
- "typer"
-]
-docs = [
- "mike==1.1.2",
- "mkdocs-autorefs~=0.4.1",
- "mkdocs-bibtex==2.8.5",
+
+ # docs
+ "mike~=1.1.2",
"mkdocs-charts-plugin==0.0.8",
"mkdocs-img2fig-plugin==0.9.3",
"mkdocs-material~=9.1.0",
"mkdocs-section-index==0.3.4",
- "mkdocstrings~=0.21.2",
- "mkdocstrings-python~=0.10.1",
+ "mkdocs~=1.5.2",
+ "mkdocstrings~=0.20",
+ "mkdocstrings-python~=1.1",
+ "mkdocs-autolinks-plugin~=0.7.1",
+ "pybtex~=0.24.0",
+ "pathspec>=0.11.1", # required by vendored mkdocs-autorefs PR
+ "astunparse",
+]
+setup = [
+ "mlconjug3<3.9.0",
+ "typer"
]
[project.urls]
@@ -82,58 +85,114 @@ version = { attr = "edsnlp.__version__" }
where = ["."]
[project.entry-points."spacy_factories"]
-"matcher" = "edsnlp.components:matcher"
-"terminology" = "edsnlp.components:terminology"
-"contextual_matcher" = "edsnlp.components:contextual_matcher"
-"endlines" = "edsnlp.components:endlines"
-"sentences" = "edsnlp.components:sentences"
-"normalizer" = "edsnlp.components:normalizer"
-"accents" = "edsnlp.components:accents"
-"lowercase" = "edsnlp.components:remove_lowercase"
-"pollution" = "edsnlp.components:pollution"
-"quotes" = "edsnlp.components:quotes"
-"charlson" = "edsnlp.components:charlson"
-"sofa" = "edsnlp.components:sofa"
-"tnm" = "edsnlp.components:tnm"
-"priority" = "edsnlp.components:priority"
-"ccmu" = "edsnlp.components:ccmu"
-"gemsa" = "edsnlp.components:gemsa"
-"covid" = "edsnlp.components:covid"
-"cim10" = "edsnlp.components:cim10"
-"history" = "edsnlp.components:history"
-"family" = "edsnlp.components:family"
-"hypothesis" = "edsnlp.components:hypothesis"
-"negation" = "edsnlp.components:negation"
-"rspeech" = "edsnlp.components:rspeech"
-"consultation_dates" = "edsnlp.components:consultation_dates"
-"dates" = "edsnlp.components:dates"
-"reason" = "edsnlp.components:reason"
-"sections" = "edsnlp.components:sections"
-"context" = "edsnlp.components:context"
-"measurements" = "edsnlp.components:measurements"
-"drugs" = "edsnlp.components:drugs"
-"nested_ner" = "edsnlp.components:nested_ner"
-"eds.span_qualifier" = "edsnlp.pipelines.trainable.span_qualifier.factory:create_component"
-"adicap" = "edsnlp.components:adicap"
-"umls" = "edsnlp.components:umls"
-"diabetes" = "edsnlp.components:diabetes"
-"tobacco" = "edsnlp.components:tobacco"
-"AIDS" = "edsnlp.components:AIDS"
-"lymphoma" = "edsnlp.components:lymphoma"
-"leukemia" = "edsnlp.components:leukemia"
-"solid_tumor" = "edsnlp.components:solid_tumor"
-"CKD" = "edsnlp.components:CKD"
-"hemiplegia" = "edsnlp.components:hemiplegia"
-"liver_disease" = "edsnlp.components:liver_disease"
-"peptic_ulcer_disease" = "edsnlp.components:peptic_ulcer_disease"
-"connective_tissue_disease" = "edsnlp.components:connective_tissue_disease"
-"COPD" = "edsnlp.components:COPD"
-"dementia" = "edsnlp.components:dementia"
-"cerebrovascular_accident" = "edsnlp.components:cerebrovascular_accident"
-"peripheral_vascular_disease" = "edsnlp.components:peripheral_vascular_disease"
-"congestive_heart_failure" = "edsnlp.components:congestive_heart_failure"
-"myocardial_infarction" = "edsnlp.components:myocardial_infarction"
-"alcohol" = "edsnlp.components:alcohol"
+# Core
+"eds.accents" = "edsnlp.pipelines.core.normalizer.accents.factory:create_component"
+"eds.context" = "edsnlp.pipelines.core.context.factory:create_component"
+"eds.contextual_matcher" = "edsnlp.pipelines.core.contextual_matcher.factory:create_component"
+"eds.endlines" = "edsnlp.pipelines.core.endlines.factory:create_component"
+"eds.matcher" = "edsnlp.pipelines.core.matcher.factory:create_component"
+"eds.normalizer" = "edsnlp.pipelines.core.normalizer.factory:create_component"
+"eds.pollution" = "edsnlp.pipelines.core.normalizer.pollution.factory:create_component"
+"eds.quotes" = "edsnlp.pipelines.core.normalizer.quotes.factory:create_component"
+"eds.remove_lowercase" = "edsnlp.pipelines.core.normalizer.remove_lowercase.factory:create_component"
+"eds.sentences" = "edsnlp.pipelines.core.sentences.factory:create_component"
+"eds.spaces" = "edsnlp.pipelines.core.normalizer.spaces.factory:create_component"
+"eds.terminology" = "edsnlp.pipelines.core.terminology.factory:create_component"
+
+# NER
+"eds.adicap" = "edsnlp.pipelines.ner.adicap.factory:create_component"
+"eds.ccmu" = "edsnlp.pipelines.ner.scores.emergency.ccmu.factory:create_component"
+"eds.charlson" = "edsnlp.pipelines.ner.scores.charlson.factory:create_component"
+"eds.cim10" = "edsnlp.pipelines.ner.cim10.factory:create_component"
+"eds.covid" = "edsnlp.pipelines.ner.covid.factory:create_component"
+"eds.drugs" = "edsnlp.pipelines.ner.drugs.factory:create_component"
+"eds.elston_ellis" = "edsnlp.pipelines.ner.scores.elston_ellis.factory:create_component"
+"eds.gemsa" = "edsnlp.pipelines.ner.scores.emergency.gemsa.factory:create_component"
+"eds.priority" = "edsnlp.pipelines.ner.scores.emergency.priority.factory:create_component"
+"eds.score" = "edsnlp.pipelines.ner.scores.factory:create_component"
+"eds.sofa" = "edsnlp.pipelines.ner.scores.sofa.factory:create_component"
+"eds.tnm" = "edsnlp.pipelines.ner.tnm.factory:create_component"
+"eds.umls" = "edsnlp.pipelines.ner.umls.factory:create_component"
+
+# NER/Comorbidities
+"eds.aids" = "edsnlp.pipelines.ner.disorders.aids.factory:create_component"
+"eds.alcohol" = "edsnlp.pipelines.ner.behaviors.alcohol.factory:create_component"
+"eds.cerebrovascular_accident" = "edsnlp.pipelines.ner.disorders.cerebrovascular_accident.factory:create_component"
+"eds.ckd" = "edsnlp.pipelines.ner.disorders.ckd.factory:create_component"
+"eds.congestive_heart_failure" = "edsnlp.pipelines.ner.disorders.congestive_heart_failure.factory:create_component"
+"eds.connective_tissue_disease" = "edsnlp.pipelines.ner.disorders.connective_tissue_disease.factory:create_component"
+"eds.copd" = "edsnlp.pipelines.ner.disorders.copd.factory:create_component"
+"eds.dementia" = "edsnlp.pipelines.ner.disorders.dementia.factory:create_component"
+"eds.diabetes" = "edsnlp.pipelines.ner.disorders.diabetes.factory:create_component"
+"eds.hemiplegia" = "edsnlp.pipelines.ner.disorders.hemiplegia.factory:create_component"
+"eds.leukemia" = "edsnlp.pipelines.ner.disorders.leukemia.factory:create_component"
+"eds.liver_disease" = "edsnlp.pipelines.ner.disorders.liver_disease.factory:create_component"
+"eds.lymphoma" = "edsnlp.pipelines.ner.disorders.lymphoma.factory:create_component"
+"eds.myocardial_infarction" = "edsnlp.pipelines.ner.disorders.myocardial_infarction.factory:create_component"
+"eds.peptic_ulcer_disease" = "edsnlp.pipelines.ner.disorders.peptic_ulcer_disease.factory:create_component"
+"eds.peripheral_vascular_disease" = "edsnlp.pipelines.ner.disorders.peripheral_vascular_disease.factory:create_component"
+"eds.solid_tumor" = "edsnlp.pipelines.ner.disorders.solid_tumor.factory:create_component"
+"eds.tobacco" = "edsnlp.pipelines.ner.behaviors.tobacco.factory:create_component"
+
+# Qualifiers
+"eds.family" = "edsnlp.pipelines.qualifiers.family.factory:create_component"
+"eds.history" = "edsnlp.pipelines.qualifiers.history.factory:create_component"
+"eds.hypothesis" = "edsnlp.pipelines.qualifiers.hypothesis.factory:create_component"
+"eds.negation" = "edsnlp.pipelines.qualifiers.negation.factory:create_component"
+"eds.reported_speech" = "edsnlp.pipelines.qualifiers.reported_speech.factory:create_component"
+
+# Misc
+"eds.consultation_dates" = "edsnlp.pipelines.misc.consultation_dates.factory:create_component"
+"eds.dates" = "edsnlp.pipelines.misc.dates.factory:create_component"
+"eds.measurements" = "edsnlp.pipelines.misc.measurements.factory:create_component"
+"eds.reason" = "edsnlp.pipelines.misc.reason.factory:create_component"
+"eds.sections" = "edsnlp.pipelines.misc.sections.factory:create_component"
+"eds.tables" = "edsnlp.pipelines.misc.tables.factory:create_component"
+
+# Trainable
+"eds.nested_ner" = "edsnlp.pipelines.trainable.nested_ner.factory:create_component"
+"eds.span_qualifier" = "edsnlp.pipelines.trainable.span_qualifier.factory:create_component"
+
+# Deprecated (links to the same factories as above)
+"SOFA" = "edsnlp.pipelines.ner.scores.sofa.factory:create_component"
+"accents" = "edsnlp.pipelines.core.normalizer.accents.factory:create_component"
+"charlson" = "edsnlp.pipelines.ner.scores.charlson.factory:create_component"
+"consultation_dates" = "edsnlp.pipelines.misc.consultation_dates.factory:create_component"
+"contextual-matcher" = "edsnlp.pipelines.core.contextual_matcher.factory:create_component"
+"dates" = "edsnlp.pipelines.misc.dates.factory:create_component"
+"eds.AIDS" = "edsnlp.pipelines.ner.disorders.aids.factory:create_component"
+"eds.CKD" = "edsnlp.pipelines.ner.disorders.ckd.factory:create_component"
+"eds.COPD" = "edsnlp.pipelines.ner.disorders.copd.factory:create_component"
+"eds.SOFA" = "edsnlp.pipelines.ner.scores.sofa.factory:create_component"
+"eds.TNM" = "edsnlp.pipelines.ner.tnm.factory:create_component"
+"eds.elston-ellis" = "edsnlp.pipelines.ner.scores.elston_ellis.factory:create_component"
+"eds.elstonellis" = "edsnlp.pipelines.ner.scores.elston_ellis.factory:create_component"
+"eds.emergency.ccmu" = "edsnlp.pipelines.ner.scores.emergency.ccmu.factory:create_component"
+"eds.emergency.gemsa" = "edsnlp.pipelines.ner.scores.emergency.gemsa.factory:create_component"
+"eds.emergency.priority" = "edsnlp.pipelines.ner.scores.emergency.priority.factory:create_component"
+"eds.measures" = "edsnlp.pipelines.misc.measurements.factory:create_component"
+"eds.remove-lowercase" = "edsnlp.pipelines.core.normalizer.remove_lowercase.factory:create_component"
+"emergency.ccmu" = "edsnlp.pipelines.ner.scores.emergency.ccmu.factory:create_component"
+"emergency.gemsa" = "edsnlp.pipelines.ner.scores.emergency.gemsa.factory:create_component"
+"emergency.priority" = "edsnlp.pipelines.ner.scores.emergency.priority.factory:create_component"
+"endlines" = "edsnlp.pipelines.core.endlines.factory:create_component"
+"family" = "edsnlp.pipelines.qualifiers.family.factory:create_component"
+"hypothesis" = "edsnlp.pipelines.qualifiers.hypothesis.factory:create_component"
+"matcher" = "edsnlp.pipelines.core.matcher.factory:create_component"
+"negation" = "edsnlp.pipelines.qualifiers.negation.factory:create_component"
+"normalizer" = "edsnlp.pipelines.core.normalizer.factory:create_component"
+"pollution" = "edsnlp.pipelines.core.normalizer.pollution.factory:create_component"
+"quotes" = "edsnlp.pipelines.core.normalizer.quotes.factory:create_component"
+"reason" = "edsnlp.pipelines.misc.reason.factory:create_component"
+"remove-lowercase" = "edsnlp.pipelines.core.normalizer.remove_lowercase.factory:create_component"
+"reported_speech" = "edsnlp.pipelines.qualifiers.reported_speech.factory:create_component"
+"rspeech" = "edsnlp.pipelines.qualifiers.reported_speech.factory:create_component"
+"score" = "edsnlp.pipelines.ner.scores.factory:create_component"
+"sections" = "edsnlp.pipelines.misc.sections.factory:create_component"
+"sentences" = "edsnlp.pipelines.core.sentences.factory:create_component"
+"spaces" = "edsnlp.pipelines.core.normalizer.spaces.factory:create_component"
+"tables" = "edsnlp.pipelines.misc.tables.factory:create_component"
+"terminology" = "edsnlp.pipelines.core.terminology.factory:create_component"
[project.entry-points."spacy_architectures"]
"eds.stack_crf_ner_model.v1" = "edsnlp.pipelines.trainable.nested_ner.stack_crf_ner:create_model"
@@ -149,6 +208,10 @@ where = ["."]
[project.entry-points."spacy_languages"]
"eds" = "edsnlp.language:EDSLanguage"
+[project.entry-points."mkdocs.plugins"]
+"bibtex" = "docs.scripts.bibtex:BibTexPlugin"
+"autorefs" = "docs.scripts.autorefs.plugin:AutorefsPlugin"
+
[build-system]
requires = [
"setuptools",
@@ -177,6 +240,37 @@ requires = [
]
build-backend = "setuptools.build_meta"
+[tool.ruff]
+fix = true
+extend-exclude = [
+ ".git",
+ "__pycache__",
+ "__init__.py",
+ ".mypy_cache",
+ ".pytest_cache",
+ ".venv",
+ "build",
+ "edsnlp/pipelines/factories.py",
+]
+line-length = 88
+select = [
+ "E",
+ "F",
+ "W",
+ "I001"
+]
+
+[tool.ruff.flake8-tidy-imports]
+ban-relative-imports = "parents"
+
+[tool.ruff.extend-per-file-ignores]
+"__init__.py" = ["F401"]
+"edsnlp/pipelines/factories.py" = [ "F401", "E501" ]
+
+[tool.ruff.isort]
+known-first-party = ["edsnlp"]
+order-by-type = true
+
[tool.interrogate]
ignore-init-method = true
ignore-init-module = true
@@ -199,16 +293,17 @@ omit-covered-files = false
# badge-format = "svg"
-[tool.coverage]
+[tool.coverage.report]
exclude_lines = [
+ "def __repr__",
"if __name__ == .__main__.:",
- "if TYPE_CHECKING:",
- "if typing.TYPE_CHECKING:",
"@overload",
"pragma: no cover",
- "raise AssertionError",
- "raise NotImplementedError",
- "def __repr__",
+ "raise .*Error",
+ "if TYPE_CHECKING:",
+ "class .*\\bProtocol\\):",
+ "@(abc\\.)?abstractmethod",
"Span.set_extension.*",
"Doc.set_extension.*",
"Token.set_extension.*",
diff --git a/tests/extract_docs_code.py b/tests/extract_docs_code.py
new file mode 100644
index 000000000..1faec44ef
--- /dev/null
+++ b/tests/extract_docs_code.py
@@ -0,0 +1,156 @@
+import re
+import shutil
+import tempfile
+from textwrap import dedent
+from typing import Tuple
+
+from markdown.extensions import Extension
+from markdown.extensions.attr_list import get_attrs
+from markdown.extensions.codehilite import parse_hl_lines
+from markdown.extensions.fenced_code import FencedBlockPreprocessor
+from mkdocs.commands.build import build
+from mkdocs.config import load_config
+from mkdocs.config.config_options import Type as MkType
+from mkdocs.config.defaults import MkDocsConfig
+from mkdocs.plugins import BasePlugin
+from mkdocstrings.extension import AutoDocProcessor
+from mkdocstrings.plugin import MkdocstringsPlugin
+
+BRACKET_RE = re.compile(r"\[([^\[]+)\]")
+CITE_RE = re.compile(r"@([\w_:-]+)")
+DEF_RE = re.compile(r"\A {0,3}\[@([\w_:-]+)\]:\s*(.*)")
+INDENT_RE = re.compile(r"\A\t| {4}(.*)")
+
+CITATION_RE = r"(\[@(?:[\w_:-]+)(?: *, *@(?:[\w_:-]+))*\])"
+
+
+class PyCodePreprocessor(FencedBlockPreprocessor):
+ """Gather reference definitions and citation keys"""
+
+ FENCED_BLOCK_RE = re.compile(
+ dedent(
+ r"""
+ (?P<fence>^[ ]*(?:~{3,}|`{3,}))[ ]* # opening fence
+ ((\{(?P<attrs>[^\}\n]*)\})| # (optional {attrs} or
+ (\.?(?P<lang>[\w#.+-]*)[ ]*)? # optional (.)lang
+ (hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot)[ ]*)?) # (optional hl_lines)
+ \n # newline (end of opening fence)
+ (?P<code>.*?)(?<=\n) # the code block
+ (?P=fence)[ ]*$ # closing fence
+ """ # noqa: E501
+ ),
+ re.MULTILINE | re.DOTALL | re.VERBOSE,
+ )
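+ # NB: mirrors markdown's FencedBlockPreprocessor.FENCED_BLOCK_RE, with a leading
+ # `[ ]*` on the fence group so that indented code blocks also match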
+
+ def __init__(self, md, code_blocks):
+ super().__init__(md, {})
+ self.code_blocks = code_blocks
+
+ def run(self, lines):
+ text = "\n".join(lines)
+ while True:
+ # ---- https://github.com/Python-Markdown/markdown/blob/5a2fee/markdown/extensions/fenced_code.py#L84C9-L98 # noqa: E501
+ m = self.FENCED_BLOCK_RE.search(text)
+ if m:
+ lang, id, classes, config = None, "", [], {}
+ if m.group("attrs"):
+ id, classes, config = self.handle_attrs(get_attrs(m.group("attrs")))
+ if len(classes):
+ lang = classes.pop(0)
+ else:
+ if m.group("lang"):
+ lang = m.group("lang")
+ if m.group("hl_lines"):
+ # Support `hl_lines` outside of `attrs` for
+ # backward-compatibility
+ config["hl_lines"] = parse_hl_lines(m.group("hl_lines"))
+ # ----
+ code = m.group("code")
+
+ if lang == "python" and "no-check" not in classes:
+ self.code_blocks.append(dedent(code))
+ else:
+ break
+ text = text[m.end() :]
+
+ return lines
+
+
+context_citations = None
+
+
+class PyCodeExtension(Extension):
+ def __init__(self, code_blocks):
+ super(PyCodeExtension, self).__init__()
+ self.code_blocks = code_blocks
+
+ def extendMarkdown(self, md):
+ self.md = md
+ md.registerExtension(self)
+ md.preprocessors.register(
+ PyCodePreprocessor(md, self.code_blocks), "fenced_code", 31
+ )
+ for ext in md.registeredExtensions:
+ if isinstance(ext, AutoDocProcessor):
+ ext._config["mdx"].append(self)
+
+
+def makeExtension(*args, **kwargs):
+ return PyCodeExtension(*args, **kwargs)
+
+
+class PyCodeExtractorPlugin(BasePlugin):
+ config_scheme: Tuple[Tuple[str, MkType]] = (
+ # ("bibtex_file", MkType(str)), # type: ignore[assignment]
+ # ("order", MkType(str, default="unsorted")), # type: ignore[assignment]
+ )
+
+ def __init__(self, global_config):
+ self.global_config = global_config
+ self.page_code_blocks = []
+ self.docs_code_blocks = []
+
+ def on_config(self, config: MkDocsConfig):
+ self.ext = PyCodeExtension(self.page_code_blocks)
+ # must come after pymdownx.highlight, whose odd registration behavior
+ # otherwise deletes the first extension
+ config["markdown_extensions"].append(self.ext)
+ config["markdown_extensions"].remove("pymdownx.highlight")
+ config["markdown_extensions"].remove("fenced_code")
+
+ def on_pre_build(self, *, config: MkDocsConfig):
+ mkdocstrings_plugin: MkdocstringsPlugin = config.plugins["mkdocstrings"]
+ mkdocstrings_plugin.get_handler("python")
+
+ def on_page_content(self, html, page, config, files):
+ if len(self.page_code_blocks):
+ self.docs_code_blocks.append((page.url, "\n".join(self.page_code_blocks)))
+ self.page_code_blocks.clear()
+ return html
+
+
+def extract_docs_code():
+ config = load_config()
+
+ temp_dir = tempfile.mkdtemp()
+ try:
+ config["site_dir"] = temp_dir
+
+ # plug the pycode extractor plugin
+ plugin = PyCodeExtractorPlugin(config)
+ config.plugins["pycode_extractor"] = plugin
+
+ config["plugins"].run_event("startup", command="build", dirty=False)
+ try:
+ build(config)
+ finally:
+ config["plugins"].run_event("shutdown")
+
+ finally:
+ shutil.rmtree(temp_dir, ignore_errors=True)
+
+ return plugin.docs_code_blocks
diff --git a/tests/pipelines/core/test_contextual_matcher.py b/tests/pipelines/core/test_contextual_matcher.py
index 6b615c673..7f4aaf6e7 100644
--- a/tests/pipelines/core/test_contextual_matcher.py
+++ b/tests/pipelines/core/test_contextual_matcher.py
@@ -212,6 +212,7 @@ def test_contextual(blank_nlp, params, example):
"eds.contextual-matcher",
name="Cancer",
config=dict(
+ label="Cancer",
patterns=patterns,
include_assigned=include_assigned,
),
diff --git a/tests/pipelines/core/test_endlines.py b/tests/pipelines/core/test_endlines.py
index febd7215c..6ce594a30 100644
--- a/tests/pipelines/core/test_endlines.py
+++ b/tests/pipelines/core/test_endlines.py
@@ -1,8 +1,8 @@
import spacy
from pytest import fixture
-from edsnlp.pipelines.core.endlines.endlinesmodel import EndLinesModel
from edsnlp.pipelines.core.endlines.functional import build_path
+from edsnlp.pipelines.core.endlines.model import EndLinesModel
texts = [
"""Le patient est arrivé hier soir.
diff --git a/tests/pipelines/misc/test_consultation_date.py b/tests/pipelines/misc/test_consultation_date.py
index 0bab6657b..4fad33336 100644
--- a/tests/pipelines/misc/test_consultation_date.py
+++ b/tests/pipelines/misc/test_consultation_date.py
@@ -68,7 +68,7 @@ def test_cons_dates(date_pipeline, example, blank_nlp):
doc = blank_nlp(TEXT)
- assert len(doc.spans["dates"]) == 5 or not date_pipeline
+ assert not date_pipeline or len(doc.spans["dates"]) == 5
assert len(doc.spans["consultation_dates"]) == len(example["result"])
diff --git a/tests/pipelines/misc/test_dates.py b/tests/pipelines/misc/test_dates.py
index 54f0057a8..eb755d5c9 100644
--- a/tests/pipelines/misc/test_dates.py
+++ b/tests/pipelines/misc/test_dates.py
@@ -6,7 +6,7 @@
from pytest import fixture
from spacy.language import Language
-from edsnlp.pipelines.misc.dates.models import AbsoluteDate, Direction, Mode
+from edsnlp.pipelines.misc.dates.models import AbsoluteDate, Relative
from edsnlp.utils.examples import parse_example
TZ = pytz.timezone("Europe/Paris")
@@ -16,10 +16,10 @@
"Le patient est venu en 2019 pour une "
"consultation"
),
- "Le patient est venu hier",
+ "Le patient est venu hier",
"le 04/09/2021",
(
- "Il est cas contact "
+ "Il est cas contact "
"depuis la semaine dernière"
),
"le 09/08",
@@ -34,30 +34,30 @@
"pour..."
),
(
- "Il est venu il y a "
+ "Il est venu il y a "
"trois mois pour..."
),
(
"Il lui était arrivé la même chose il y a un an."
+ "direction=past year=1>il y a un an."
),
(
"Il est venu le 20/09/2001 pour..."
),
(
- "Consultation du 03 07 19"
),
"En 11/2017 stabilité sur...",
- "depuis 3 mois",
+ "depuis 3 mois",
"- Décembre 2004 :",
"- Juin 2005: ",
# "-sept 2017 :",
(
- "il y a 1 an "
- "pdt 1 mois"
+ "il y a 1 an "
+ "pdt 1 mois"
),
(
"Prélevé le : 22/04/2016 "
@@ -65,7 +65,7 @@
),
"Le 07/01.",
"Il est venu en août.",
- "Il est venu ce jour.",
+ "Il est venu ce jour.",
"CS le 11-01-2017 1/3",
"Vu le 11 janvier\n2017 .",
]
@@ -83,31 +83,33 @@ def test_dates_component(blank_nlp: Language):
text, entities = parse_example(example)
doc = blank_nlp(text)
+ spans = sorted(doc.spans["dates"] + doc.spans["durations"])
- assert len(doc.spans["dates"]) == len(entities)
+ assert len(spans) == len(entities)
assert len(doc.ents) == len(entities)
- for span, entity in zip(doc.spans["dates"], entities):
+ for span, entity in zip(spans, entities):
assert span.text == text[entity.start_char : entity.end_char]
- date = span._.date
+ date = span._.date if span.label_ == "date" else span._.duration
d = {modifier.key: modifier.value for modifier in entity.modifiers}
norm = d.pop("norm")
if "direction" in d:
- d["direction"] = Direction[d["direction"]]
- if "mode" in d:
- d["mode"] = Mode[d["mode"]]
+ d["mode"] = "relative"
+ if "mode" not in d:
+ d["mode"] = "absolute"
assert date.dict(exclude_none=True) == d
assert date.norm() == norm
set_d = set(d)
+ d.pop("mode", None)
+ d.pop("direction", None)
+ d.pop("bound", None)
if isinstance(date, AbsoluteDate) and {"year", "month", "day"}.issubset(
set_d
):
- d.pop("direction", None)
- d.pop("mode", None)
assert date.to_datetime() == TZ.localize(datetime(**d))
elif isinstance(date, AbsoluteDate):
@@ -143,48 +145,11 @@ def test_dates_component(blank_nlp: Language):
note_datetime=note_datetime, infer_from_context=True
) == TZ.localize(datetime(**d))
- if isinstance(date, AbsoluteDate) and {"year", "month", "day"}.issubset(
- set_d
- ):
- d.pop("direction", None)
- d.pop("mode", None)
- assert date.to_datetime() == TZ.localize(datetime(**d))
-
- elif isinstance(date, AbsoluteDate):
+ elif isinstance(date, Relative):
assert date.to_datetime() is None
-
- # no year
- if {"month", "day"}.issubset(set_d) and {"year"}.isdisjoint(set_d):
- d["year"] = note_datetime.year
- assert date.to_datetime(
- note_datetime=note_datetime, infer_from_context=True
- ) == TZ.localize(datetime(**d))
-
- # no day
- if {"month", "year"}.issubset(set_d) and {"day"}.isdisjoint(set_d):
- d["day"] = 1
- assert date.to_datetime(
- note_datetime=note_datetime, infer_from_context=True
- ) == TZ.localize(datetime(**d))
-
- # year only
- if {"year"}.issubset(set_d) and {"day", "month"}.isdisjoint(set_d):
- d["day"] = 1
- d["month"] = 1
- assert date.to_datetime(
- note_datetime=note_datetime, infer_from_context=True
- ) == TZ.localize(datetime(**d))
-
- # month only
- if {"month"}.issubset(set_d) and {"day", "year"}.isdisjoint(set_d):
- d["day"] = 1
- d["year"] = note_datetime.year
- assert date.to_datetime(
- note_datetime=note_datetime, infer_from_context=True
- ) == TZ.localize(datetime(**d))
-
else:
- assert date.to_datetime() is not None
+ assert date.to_duration()
+ assert date.to_datetime(note_datetime=note_datetime)
def test_periods(blank_nlp: Language):
@@ -224,9 +189,11 @@ def test_time(with_time: bool):
doc = nlp(text)
- assert len(doc.spans["dates"]) == len(entities)
+ spans = sorted(doc.spans["dates"] + doc.spans["durations"])
- for span, entity in zip(doc.spans["dates"], entities):
+ assert len(spans) == len(entities)
+
+ for span, entity in zip(spans, entities):
assert span.text == text[entity.start_char : entity.end_char]
norm = next(m.value for m in entity.modifiers if m.key == "norm")
assert span._.date.norm() == norm
@@ -254,7 +221,7 @@ def test_false_positives(blank_nlp: Language):
for example in counter_examples:
doc = blank_nlp(example)
- assert len(doc.spans["dates"]) == 0
+ assert len((*doc.spans["dates"], *doc.spans["durations"])) == 0
def test_dates_on_ents_only():
@@ -277,9 +244,11 @@ def test_dates_on_ents_only():
assert len(doc.ents) == 1
- assert len(doc.spans["dates"]) == len(entities)
+ spans = sorted(doc.spans["dates"] + doc.spans["durations"])
+
+ assert len(spans) == len(entities)
- for span, entity in zip(doc.spans["dates"], entities):
+ for span, entity in zip(spans, entities):
assert span.text == text[entity.start_char : entity.end_char]
@@ -290,5 +259,5 @@ def test_illegal_dates(blank_nlp):
)
for text in texts:
doc = blank_nlp(text)
- ent = doc.spans["dates"][0]
+ ent = sorted((*doc.spans["dates"], *doc.spans["durations"]))[0]
assert ent._.date.to_datetime() is None
diff --git a/tests/pipelines/misc/test_measurements.py b/tests/pipelines/misc/test_measurements.py
index 00f9b5e89..7383451c0 100644
--- a/tests/pipelines/misc/test_measurements.py
+++ b/tests/pipelines/misc/test_measurements.py
@@ -34,7 +34,7 @@ def test_default_factory(blank_nlp: Language):
blank_nlp.add_pipe("matcher", config=dict(terms={"patient": "patient"}))
blank_nlp.add_pipe(
"eds.measurements",
- config=dict(measurements=["eds.size", "eds.weight", "eds.bmi"]),
+ config=dict(measurements=["size", "weight", "bmi"]),
)
doc = blank_nlp(text)
@@ -94,15 +94,15 @@ def test_measure_label(blank_nlp: Language, matcher: MeasurementsMatcher):
m1, m2, m3, m4, m5, m6, m7, m8, m9 = doc.spans["measurements"]
- assert m1.label_ == "eds.size"
- assert m2.label_ == "eds.weight"
- assert m3.label_ == "eds.size"
- assert m4.label_ == "eds.size"
- assert m5.label_ == "eds.size"
- assert m6.label_ == "eds.size"
- assert m7.label_ == "eds.size"
- assert m8.label_ == "eds.size"
- assert m9.label_ == "eds.size"
+ assert m1.label_ == "size"
+ assert m2.label_ == "weight"
+ assert m3.label_ == "size"
+ assert m4.label_ == "size"
+ assert m5.label_ == "size"
+ assert m6.label_ == "size"
+ assert m7.label_ == "size"
+ assert m8.label_ == "size"
+ assert m9.label_ == "size"
def test_measure_str(blank_nlp: Language, matcher: MeasurementsMatcher):
@@ -214,21 +214,24 @@ def test_ranges(blank_nlp: Language, matcher: MeasurementsMatcher):
def test_merge_align(blank_nlp, matcher):
matcher.merge_mode = "align"
+ matcher.span_getter = {"candidates": True}
+ matcher.span_setter = {"ents": True}
doc = blank_nlp(text)
- ent = Span(doc, 10, 15, label="eds.size")
- doc.ents = [ent]
+ ent = Span(doc, 10, 15, label="size")
+ doc.spans["candidates"] = [ent]
doc = matcher(doc)
assert len(doc.ents) == 1
assert str(ent._.value) == "2.0 cm"
-def test_merge_intersect(blank_nlp, matcher):
+def test_merge_intersect(blank_nlp, matcher: MeasurementsMatcher):
matcher.merge_mode = "intersect"
- matcher.as_ents = True
+ matcher.span_setter = {**matcher.span_setter, "ents": True}
+ matcher.span_getter = {"lookup_zones": True}
doc = blank_nlp(text)
- ent = Span(doc, 10, 16, label="eds.size")
- doc.ents = [ent]
+ ent = Span(doc, 10, 16, label="size")
+ doc.spans["lookup_zones"] = [ent]
doc = matcher(doc)
assert len(doc.ents) == 2
diff --git a/tests/pipelines/ner/disorders/test_all.py b/tests/pipelines/ner/disorders/test_all.py
index 909873de2..490421c76 100644
--- a/tests/pipelines/ner/disorders/test_all.py
+++ b/tests/pipelines/ner/disorders/test_all.py
@@ -20,9 +20,9 @@
from tobacco import results_tobacco
results = dict(
- AIDS=results_aids,
- CKD=results_ckd,
- COPD=results_copd,
+ aids=results_aids,
+ ckd=results_ckd,
+ copd=results_copd,
alcohol=results_alcohol,
cerebrovascular_accident=results_cerebrovascular_accident,
congestive_heart_failure=results_congestive_heart_failure,
@@ -83,6 +83,9 @@ def check(self):
assert len(ents) == int(has_match)
+ for ent in ents:
+ assert ent.label_ == self.disorder
+
if not ents:
continue
diff --git a/tests/pipelines/ner/test_score.py b/tests/pipelines/ner/test_score.py
index cad10cfcf..6d0aab203 100644
--- a/tests/pipelines/ner/test_score.py
+++ b/tests/pipelines/ner/test_score.py
@@ -1,10 +1,6 @@
-# noqa: F401
import re
from edsnlp.pipelines.ner.scores import Score
-
-# from edsnlp.pipelines.ner.scores.charlson import patterns as charlson_terms
-# from edsnlp.pipelines.ner.scores.sofa import patterns as sofa_terms
from edsnlp.utils.examples import parse_example
example = """
@@ -17,25 +13,24 @@
testScore de 1.
TestScore de 0.
Testons également un autre score.
-SOFA maximum : 12.
+SOFA maximum : 12.
CR-URG.
-PRIORITE: 2: 2 - Urgence relative.
-GEMSA: (2) Patient non convoque sortant apres consultation
-CCMU: Etat clinique jugé stable avec actes diag ou thérapeutiques ( 2 )
+PRIORITE: 2: 2 - Urgence relative.
+GEMSA: (2) Patient non convoque sortant apres consultation
+CCMU: Etat clinique jugé stable avec actes diag ou thérapeutiques ( 2 )
CONCLUSION
La patiente est atteinte d'un carcinome mammaire infiltrant de type non spécifique, de grade 2 de malignité selon Elston et Ellis
-(architecture : 3 + noyaux : 3 + mitoses : 1).
+(architecture : 3 + noyaux : 3 + mitoses : 1).
""" # noqa: E501
def test_scores(blank_nlp):
-
blank_nlp.add_pipe(
"eds.normalizer",
config=dict(lowercase=True, accents=True, quotes=True, pollution=False),
@@ -47,6 +42,7 @@ def testscore_normalization(raw_score: str):
testscore = Score(
blank_nlp,
+ name="TestScore",
score_name="TestScore",
regex=[r"test+score"],
attr="NORM",
@@ -60,12 +56,12 @@ def testscore_normalization(raw_score: str):
text, entities = parse_example(example=example)
- blank_nlp.add_pipe("charlson")
- blank_nlp.add_pipe("SOFA")
- blank_nlp.add_pipe("eds.elston-ellis")
- blank_nlp.add_pipe("emergency.priority")
- blank_nlp.add_pipe("emergency.ccmu")
- blank_nlp.add_pipe("emergency.gemsa")
+ blank_nlp.add_pipe("eds.charlson")
+ blank_nlp.add_pipe("eds.sofa")
+ blank_nlp.add_pipe("eds.elston_ellis")
+ blank_nlp.add_pipe("eds.emergency_priority")
+ blank_nlp.add_pipe("eds.emergency_ccmu")
+ blank_nlp.add_pipe("eds.emergency_gemsa")
doc = blank_nlp(text)
doc = testscore(doc)
@@ -73,7 +69,6 @@ def testscore_normalization(raw_score: str):
for entity, ent in zip(entities, doc.ents):
for modifier in entity.modifiers:
-
assert (
getattr(ent._, modifier.key) == modifier.value
), f"{modifier.key} labels don't match."
diff --git a/tests/pipelines/ner/test_tnm.py b/tests/pipelines/ner/test_tnm.py
index af378b714..25b168f6d 100644
--- a/tests/pipelines/ner/test_tnm.py
+++ b/tests/pipelines/ner/test_tnm.py
@@ -17,7 +17,7 @@
def test_scores(blank_nlp):
- blank_nlp.add_pipe("eds.TNM")
+ blank_nlp.add_pipe("eds.tnm")
for example in examples:
diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py
index c6bf38c56..f37604f63 100644
--- a/tests/pipelines/test_pipelines.py
+++ b/tests/pipelines/test_pipelines.py
@@ -6,3 +6,7 @@ def test_pipelines(doc):
assert anomalie._.negation
assert not doc[0]._.history
+
+
+def test_import_all():
+ import edsnlp.pipelines.factories # noqa: F401
diff --git a/tests/processing/test_processing.py b/tests/processing/test_processing.py
index c9f70e0a1..b8085a3e6 100644
--- a/tests/processing/test_processing.py
+++ b/tests/processing/test_processing.py
@@ -92,7 +92,7 @@ def model(lang):
nlp.add_pipe("eds.hypothesis")
nlp.add_pipe("eds.family")
nlp.add_pipe("eds.reported_speech")
- nlp.add_pipe("eds.SOFA")
+ nlp.add_pipe("eds.sofa")
nlp.add_pipe("eds.dates")
return nlp
diff --git a/tests/test_docs.py b/tests/test_docs.py
index f62bafcb5..6d239aaa6 100644
--- a/tests/test_docs.py
+++ b/tests/test_docs.py
@@ -1,23 +1,31 @@
-from itertools import chain
-from pathlib import Path
-
import pytest
+from extract_docs_code import extract_docs_code
+
+url_to_code = dict(extract_docs_code())
-from edsnlp.utils.blocs import check_md_file
-# @pytest.fixture(autouse=True, scope="module")
-# def brat_folder():
-# yield
-# shutil.rmtree("path/to/brat")
+def printer(code: str) -> None:
+ """
+ Prints a code bloc with lines for easier debugging.
+
+ Parameters
+ ----------
+ code : str
+ Code bloc.
+ """
+ lines = []
+ for i, line in enumerate(code.split("\n")):
+ lines.append(f"{i + 1:03} {line}")
-files = chain(
- Path("./").glob("*.md"),
- Path("docs").glob("**/*.md"),
-)
+ print("\n".join(lines))
# Note the use of `str`, makes for pretty output
-@pytest.mark.parametrize("path", files, ids=str)
-def test_code_blocks(path):
- check_md_file(path=path, memory=True)
+@pytest.mark.parametrize("url", sorted(url_to_code.keys()), ids=str)
+def test_code_blocks(url):
+ raw = url_to_code[url]
+ try:
+ exec(raw, {"__MODULE__": "__main__"})
+ except Exception:
+ printer(raw)
+ raise
diff --git a/tests/test_span_args.py b/tests/test_span_args.py
new file mode 100644
index 000000000..0b73681e6
--- /dev/null
+++ b/tests/test_span_args.py
@@ -0,0 +1,32 @@
+from pydantic import validate_arguments
+
+from edsnlp.pipelines.base import (
+ SpanGetterArg,
+ SpanSetterArg,
+ validate_span_getter,
+ validate_span_setter,
+)
+
+
+def test_span_getter():
+ assert validate_span_getter("ents") == {"ents": True}
+ assert validate_span_getter(["ents"]) == {"ents": True}
+ assert validate_span_getter(["ents", "group"]) == {"ents": True, "group": True}
+ assert validate_span_getter({"grp": True}) == {"grp": True}
+ assert validate_span_getter({"grp": ["a", "b", "c"]}) == {"grp": ["a", "b", "c"]}
+
+
+def test_span_setter():
+ assert validate_span_setter("ents") == {"ents": True}
+ assert validate_span_setter(["ents"]) == {"ents": True}
+ assert validate_span_setter(["ents", "group"]) == {"ents": True, "group": True}
+ assert validate_span_setter({"grp": True}) == {"grp": True}
+ assert validate_span_setter({"grp": ["a", "b", "c"]}) == {"grp": ["a", "b", "c"]}
+
+
+def test_validate_args():
+ @validate_arguments
+ def my_func(span_getter: SpanGetterArg, span_setter: SpanSetterArg):
+ return span_getter, span_setter
+
+ assert my_func("ents", "ents") == ({"ents": True}, {"ents": True})