diff --git a/changelog.md b/changelog.md index 72a9f72c8..bc3a36910 100644 --- a/changelog.md +++ b/changelog.md @@ -2,6 +2,10 @@ ## Unreleased +### Added + +- `eds.tables` accepts a `min_rows` (default 2) argument to reduce pollution + ### Fixed - Numbers are now only detected without trying to remove the pollution in between digits, ie `55 @ 77777` could be detected as a full number before, but not anymore. diff --git a/edsnlp/pipes/misc/tables/patterns.py b/edsnlp/pipes/misc/tables/patterns.py index 919143f60..588f2831e 100644 --- a/edsnlp/pipes/misc/tables/patterns.py +++ b/edsnlp/pipes/misc/tables/patterns.py @@ -1,2 +1,2 @@ sep = ["¦", "|"] -regex_template = [r"(?:{sep}?(?:[^{sep}\n]*{sep})+[^{sep}\n]*{sep}?\n)+"] +regex_template = [r"(?:{sep}?(?:[^{sep}\n]*{sep})+[^{sep}\n]*{sep}?\n){{{n},}}"] diff --git a/edsnlp/pipes/misc/tables/tables.py b/edsnlp/pipes/misc/tables/tables.py index ae57300d4..ede5a91b4 100644 --- a/edsnlp/pipes/misc/tables/tables.py +++ b/edsnlp/pipes/misc/tables/tables.py @@ -111,6 +111,8 @@ class TablesMatcher(BaseComponent): sep_pattern : Optional[str] The regex pattern to identify the separator pattern. Used when calling `to_pd_table`. + min_rows : int + Only tables with at least `min_rows` lines will be detected. 
attr : str spaCy's attribute to use: a string with the value "TEXT" or "NORM", or a dict with @@ -130,6 +132,7 @@ def __init__( *, tables_pattern: Optional[AsList[str]] = None, sep_pattern: Optional[AsList[str]] = None, + min_rows: int = 2, attr: Union[Dict[str, str], str] = "TEXT", ignore_excluded: bool = True, ): @@ -145,7 +148,7 @@ def __init__( "table", list( dict.fromkeys( - template.format(sep=re.escape(sep)) + template.format(sep=re.escape(sep), n=re.escape(str(min_rows))) for sep in sep_pattern for template in tables_pattern ) diff --git a/tests/pipelines/misc/test_tables.py b/tests/pipelines/misc/test_tables.py index d147c7981..4eb00d9ee 100644 --- a/tests/pipelines/misc/test_tables.py +++ b/tests/pipelines/misc/test_tables.py @@ -17,6 +17,13 @@ 2/2Pat : | | |Intitulé RCP +Table de taille <= 3 : + + |Libellé | Unité | Valeur | Intervalle | + |Leucocytes |x10*9/L |4.97 | 4.09-11 | + +qdfsdf + |Libellé | Unité | Valeur | Intervalle | |Leucocytes |x10*9/L |4.97 | 4.09-11 | |Hématies |x10*12/L|4.68 | 4.53-5.79 | @@ -35,7 +42,7 @@ def test_tables(blank_nlp): if blank_nlp.lang != "eds": pytest.skip("Test only for eds language") blank_nlp.add_pipe("eds.normalizer") - blank_nlp.add_pipe("eds.tables") + blank_nlp.add_pipe("eds.tables", config=dict(min_rows=3)) doc = blank_nlp(TEXT)