Skip to content

Commit

Permalink
adding min_rows argument to eds.tables
Browse files Browse the repository at this point in the history
  • Loading branch information
svittoz committed Sep 3, 2024
1 parent 1b64065 commit a8f9380
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 3 deletions.
4 changes: 4 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## Unreleased

### Added

- `eds.tables` accepts a `min_rows` argument (default 2) setting the minimum number of rows a table must have to be detected, to reduce pollution

### Fixed

- Numbers are now only detected without trying to remove the pollution in between digits, i.e., `55 @ 77777` could be detected as a full number before, but not anymore.
Expand Down
2 changes: 1 addition & 1 deletion edsnlp/pipes/misc/tables/patterns.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
sep = ["¦", "|"]
regex_template = [r"(?:{sep}?(?:[^{sep}\n]*{sep})+[^{sep}\n]*{sep}?\n)+"]
regex_template = [r"(?:{sep}?(?:[^{sep}\n]*{sep})+[^{sep}\n]*{sep}?\n){{{n},}}"]
5 changes: 4 additions & 1 deletion edsnlp/pipes/misc/tables/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ class TablesMatcher(BaseComponent):
sep_pattern : Optional[str]
The regex pattern to identify the separator pattern.
Used when calling `to_pd_table`.
min_rows : int
Only tables with at least `min_rows` rows will be detected (default 2).
attr : str
spaCy's attribute to use:
a string with the value "TEXT" or "NORM", or a dict with
Expand All @@ -130,6 +132,7 @@ def __init__(
*,
tables_pattern: Optional[AsList[str]] = None,
sep_pattern: Optional[AsList[str]] = None,
min_rows: int = 2,
attr: Union[Dict[str, str], str] = "TEXT",
ignore_excluded: bool = True,
):
Expand All @@ -145,7 +148,7 @@ def __init__(
"table",
list(
dict.fromkeys(
template.format(sep=re.escape(sep))
template.format(sep=re.escape(sep), n=re.escape(str(min_rows)))
for sep in sep_pattern
for template in tables_pattern
)
Expand Down
9 changes: 8 additions & 1 deletion tests/pipelines/misc/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@
2/2Pat : <NOM> <Prenom> |<date> | <ipp> |Intitulé RCP
Table de taille <= 3 :
|Libellé | Unité | Valeur | Intervalle |
|Leucocytes |x10*9/L |4.97 | 4.09-11 |
qdfsdf
|Libellé | Unité | Valeur | Intervalle |
|Leucocytes |x10*9/L |4.97 | 4.09-11 |
|Hématies |x10*12/L|4.68 | 4.53-5.79 |
Expand All @@ -35,7 +42,7 @@ def test_tables(blank_nlp):
if blank_nlp.lang != "eds":
pytest.skip("Test only for eds language")
blank_nlp.add_pipe("eds.normalizer")
blank_nlp.add_pipe("eds.tables")
blank_nlp.add_pipe("eds.tables", config=dict(min_rows=3))

doc = blank_nlp(TEXT)

Expand Down

0 comments on commit a8f9380

Please sign in to comment.