From e9a510409291bbeffb77493f39483ac14a3c9acc Mon Sep 17 00:00:00 2001 From: Thomas PETIT-JEAN Date: Mon, 5 Sep 2022 11:11:04 +0200 Subject: [PATCH] doc: update doc for new contextual matcher --- docs/pipelines/core/contextual-matcher.md | 58 ++++++++++++++++++----- docs/pipelines/ner/score.md | 17 ++++--- mkdocs.yml | 1 + notebooks/pipeline.md | 2 +- 4 files changed, 59 insertions(+), 19 deletions(-) diff --git a/docs/pipelines/core/contextual-matcher.md b/docs/pipelines/core/contextual-matcher.md index 305bbf90d..6037b20fc 100644 --- a/docs/pipelines/core/contextual-matcher.md +++ b/docs/pipelines/core/contextual-matcher.md @@ -76,13 +76,15 @@ cancer = dict( name="stage", regex=stage, window=(-10,10), - expand_entity=False, + replace_entity=True, + reduce_mode=None, ), dict( name="metastase", regex=metastase, window=10, - expand_entity=True, + replace_entity=False, + reduce_mode="keep_first", ), ] ) @@ -110,6 +112,36 @@ In this case, the configuration can be concatenated in a list: patterns = [cancer, lymphome] ``` +## Available parameters for more flexibility + +Three main parameters can be used to refine how entities will be formed: + +### The `include_assigned` parameter + +Following the previous example, you might want your extracted entities to **include**, if found, the cancer stage and the metastasis status. This can be achieved by setting `include_assigned=True` in the pipe configuration. + +For instance, from the sentence "Le patient a un cancer au stade 3", the extracted entity will be: + +- "cancer" if `include_assigned=False` +- "cancer au stade 3" if `include_assigned=True` + +### The `reduce_mode` parameter + +It might happen that an assign matches multiple times: For instance, in the (non-sensical) sentence "Le patient a un cancer au stade 3 et au stade 4", both "stade 3" and "stade 4" will be matched by the `stage` assign key. Depending on your use case, you may want to keep all extractions, or only a single one. 
+ +- If `reduce_mode=None` (default), all extractions are kept in a list +- If `reduce_mode="keep_first"`, only the extraction closest to the main matched entity will be kept (in this case, it would be "stade 3" since it is the closest to "cancer") +- If `reduce_mode="keep_last"`, only the furthest extraction is kept. + +### The `replace_entity` parameter + +This parameter can be set to `True` **only for a single assign key per dictionary**. This limitation comes from the purpose of this parameter: If set to `True`, the corresponding `assign` key will be returned as the entity, instead of the match itself. For clarity, let's take the same sentence "Le patient a un cancer au stade 3" as an example: + +- if `replace_entity=True` in the `stage` assign key, then the extracted entity will be "stade 3" instead of "cancer" +- if `replace_entity=False` for every assign key, the returned entity will be, as expected, "cancer" + +**Please notice** that with `replace_entity` set to True, if the corresponding assign key matches nothing, the entity will be discarded. + ## Usage @@ -185,13 +217,14 @@ Let us see what we can get from this pipeline with a few examples The pipeline can be configured using the following parameters : -| Parameter | Explanation | Default | -| ----------------- | ------------------------------------------------------------------------------------------------------------------------ | -------------------- | -| `patterns` | Dictionary or List of dictionaries. 
See below | | -| `assign_as_span` | Whether to store eventual extractions defined via the `assign` key as Spans or as string | False | -| `attr` | spaCy attribute to match on (eg `NORM`, `LOWER`) | `"TEXT"` | -| `ignore_excluded` | Whether to skip excluded tokens during matching | `False` | -| `regex_flags` | RegExp flags to use when matching, filtering and assigning (See [here](https://docs.python.org/3/library/re.html#flags)) | 0 (use default flag) | +| Parameter | Explanation | Default | +| ------------------ | ------------------------------------------------------------------------------------------------------------------------ | -------------------- | +| `patterns` | Dictionary or List of dictionaries. See below | | +| `assign_as_span` | Whether to store eventual extractions defined via the `assign` key as Spans or as string | False | +| `attr` | spaCy attribute to match on (eg `NORM`, `LOWER`) | `"TEXT"` | +| `ignore_excluded` | Whether to skip excluded tokens during matching | `False` | +| `include_assigned` | Whether to include (eventual)  assign matches in the final entity | `False` | +| `regex_flags` | RegExp flags to use when matching, filtering and assigning (See [here](https://docs.python.org/3/library/re.html#flags)) | 0 (use default flag) | However, most of the configuration is provided in the `patterns` key, as a **pattern dictionary** or a **list of pattern dictionaries** @@ -255,10 +288,13 @@ A pattern is a nested dictionary with the following keys: A dictionary where keys are labels and values are **Regexes with a single capturing group** - === "`expand_entity`" + === "`replace_entity`" + + If set to `True`, the match from the corresponding assign key will be used as entity, instead of the main match. See [this paragraph][the-replace_entity-parameter] - If set to `True`, the initial entity's span will be expanded to the furthest match from the `regex` dictionary + === "`reduce_mode`" + Set how multiple assign matches are handled. 
See [this paragraph][the-reduce_mode-parameter] ### A full pattern dictionary example diff --git a/docs/pipelines/ner/score.md b/docs/pipelines/ner/score.md index bd270a2a5..132438212 100644 --- a/docs/pipelines/ner/score.md +++ b/docs/pipelines/ner/score.md @@ -23,7 +23,7 @@ text = "Charlson à l'admission: 7.\n" "Charlson: \n" "OMS: \n" doc = nlp(text) doc.ents -# Out: (7,) +# Out: (Charlson à l'admission: 7,) ``` We can see that only one occurrence was extracted. The second mention of Charlson in the text @@ -57,7 +57,7 @@ text = "SOFA (à 24H) : 12.\n" "OMS: \n" doc = nlp(text) doc.ents -# Out: (12,) +# Out: (SOFA (à 24H) : 12.,) ``` Each extraction exposes 3 extensions: @@ -97,9 +97,12 @@ doc.ents ent = doc.ents[0] ent._.value.dict() # {'modifier': 'p', -# 'tumour': 'x', -# 'node': 1, -# 'metastasis': 1, +# 'tumour': None, +# 'tumour_specification': 'x', +# 'node': '1', +# 'node_specification': None, +# 'metastasis': '1', +# 'resection_completeness': None, # 'version': None, # 'version_year': None} ``` @@ -113,7 +116,7 @@ The configuration consists of 4 items: - `score_name`: The name of the score - `regex`: A list of regular expression to detect the score's mention -- `after_extract`: A regular expression to extract the score's value after the score's mention +- `value_extract`: A regular expression to extract the score's value in the context of the score's mention - `score_normalization`: A function name used to normalise the score's _raw_ value !!! 
note @@ -149,7 +152,7 @@ def score_normalization(extracted_score): charlson_config = dict( score_name="charlson", regex=[r"charlson"], - after_extract=r"charlson.*[\n\W]*(\d+)", + value_extract=r"charlson.*[\n\W]*(\d+)", score_normalization="score_normalization.charlson", ) ``` diff --git a/mkdocs.yml b/mkdocs.yml index fa5560591..c2fafd4d2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -115,6 +115,7 @@ extra_javascript: plugins: - search + - autorefs - bibtex: bib_file: "docs/references.bib" - gen-files: diff --git a/notebooks/pipeline.md b/notebooks/pipeline.md index bf48158a0..570c07470 100644 --- a/notebooks/pipeline.md +++ b/notebooks/pipeline.md @@ -111,7 +111,7 @@ def score_normalization(extracted_score): charlson_config = dict( score_name = 'charlson', regex = [r'charlson'], - after_extract = r"(\d+)", + value_extract = r"(\d+)", score_normalization = "score_normalization.charlson" )