diff --git a/README.md b/README.md index ff8a58bd..b45abcf7 100644 --- a/README.md +++ b/README.md @@ -11,9 +11,6 @@ # EDS-TeVa [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/aphp/edsteva/HEAD?labpath=notebooks%2Fsynthetic_data.ipynb)

- - Tests - Documentation @@ -59,7 +56,7 @@ pip install edsteva We recommend pinning the library version in your projects, or use a strict package manager like [Poetry](https://python-poetry.org/). ``` -pip install edsteva==0.1.1 +pip install edsteva==0.1.2 ``` ## Example diff --git a/changelog.md b/changelog.md index d93f7fc5..da41ab59 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,5 @@ # Changelog -## v0.1.2 - 13-12-2022 +## v0.1.2 - 14-12-2022 - ConditionProbe computes the availability of administrative data related to visits with at least one ICD-10 code recorded. ## v0.1.1 - 03-12-2022 diff --git a/docs/index.md b/docs/index.md index 653c6cc3..6d514f2e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,9 +10,6 @@

- - Tests - Documentation @@ -97,7 +94,7 @@ color:green Successfully installed edsteva We recommend pinning the library version in your projects, or use a strict package manager like [Poetry](https://python-poetry.org/). ``` -pip install edsteva==0.1.1 +pip install edsteva==0.1.2 ``` ## Working example: administrative records relative to visits diff --git a/edsteva/__init__.py b/edsteva/__init__.py index 8b911e6b..253455c1 100644 --- a/edsteva/__init__.py +++ b/edsteva/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.1" +__version__ = "0.1.2" import importlib diff --git a/edsteva/io/i2b2_mapping.py b/edsteva/io/i2b2_mapping.py index 1a91b7e7..0053a7c6 100644 --- a/edsteva/io/i2b2_mapping.py +++ b/edsteva/io/i2b2_mapping.py @@ -16,7 +16,7 @@ def get_i2b2_table( spark_session: SparkSession, db_name: str, db_source: str, table: str -) -> SparkDataFrame: +) -> SparkDataFrame: # pragma: no cover """ Retrieve a Spark table in i2b2 and transform it to fit with OMOP standard. @@ -161,7 +161,9 @@ def get_i2b2_table( return df -def mapping_dict(mapping: Dict[str, str], Non_renseigne: str) -> FunctionUDF: +def mapping_dict( + mapping: Dict[str, str], Non_renseigne: str +) -> FunctionUDF: # pragma: no cover """ Returns a function that maps data according to a mapping dictionnary in a Spark DataFrame. diff --git a/edsteva/io/synthetic/synthetic.py b/edsteva/io/synthetic/synthetic.py index 056f697b..951e0151 100644 --- a/edsteva/io/synthetic/synthetic.py +++ b/edsteva/io/synthetic/synthetic.py @@ -1,10 +1,11 @@ from dataclasses import dataclass, field from datetime import datetime -from typing import Dict, Tuple +from typing import Dict, List, Tuple, Union import numpy as np import pandas as pd from databricks import koalas as ks +from loguru import logger from edsteva.io.synthetic.care_site import generate_care_site_tables from edsteva.io.synthetic.utils import recursive_items @@ -16,6 +17,8 @@ generate_before_t0, ) +DataFrame = Union[ks.DataFrame, pd.DataFrame] + CARE_SITE_STRUCTURE = { "Hôpital-1": { "Pôle/DMU-11": { @@ -323,14 +326,6 @@ def generate(self): visit_detail = self._generate_visit_detail(visit_occurrence) note = self._generate_note(hospital_ids, visit_occurrence) - self.available_tables = [ - "care_site", - "visit_occurrence", - "condition_occurrence", - "fact_relationship", - "visit_detail", - "note", - ] self.care_site = care_site self.visit_occurrence = visit_occurrence self.condition_occurrence = condition_occurrence @@ -338,6 +333,8 @@ def generate(self): self.visit_detail = visit_detail self.note = note + self.list_available_tables() + if self.module == "koalas": self.convert_to_koalas() return self @@ -523,3 +520,18 @@ def reset_to_pandas(self): self.visit_detail = self.visit_detail.to_pandas() self.note = self.note.to_pandas() self.module = "pandas" + + def delete_table(self, table_name: str) -> None: + if hasattr(self, table_name): + delattr(self, table_name) + logger.info("Table {} has been deleted", table_name) + else: + logger.info("Table {} does not exist", table_name) + self.list_available_tables() + + def list_available_tables(self) -> List[str]: + available_tables = [] + for key, item in self.__dict__.items(): + if isinstance(item, DataFrame.__args__): + available_tables.append(key) + self.available_tables = available_tables diff --git a/edsteva/probes/note.py b/edsteva/probes/note.py index 4034f77a..4bda746d 100644 --- a/edsteva/probes/note.py +++ b/edsteva/probes/note.py @@ -85,7 +85,7 @@ def get_uf_visit( visit_detail, care_site, care_site_relationship, -): +): # pragma: no cover # Load Orbis note and Uf for Note note_orbis = extra_data.orbis_document[ [ @@ -151,7 +151,7 @@ def get_uf_visit( return uf_visit -def get_pole_visit(uf_visit, care_site, care_site_relationship): +def get_pole_visit(uf_visit, care_site, care_site_relationship): # pragma: no cover pole_visit = convert_table_to_pole( table=uf_visit.drop(columns=["care_site_short_name", "care_site_level"]), diff --git a/edsteva/probes/utils.py b/edsteva/probes/utils.py index 7a16009f..f1361079 100644 --- a/edsteva/probes/utils.py +++ b/edsteva/probes/utils.py @@ -24,6 +24,18 @@ def prepare_visit_occurrence(data, start_date, end_date, stay_types): + check_columns( + data.visit_occurrence, + required_columns=[ + "visit_occurrence_id", + "visit_source_value", + "visit_start_datetime", + "care_site_id", + "row_status_source_value", + "visit_occurrence_source_value", + ], + df_name="visit_occurrence", + ) visit_occurrence = data.visit_occurrence[ [ "visit_occurrence_id", diff --git a/edsteva/utils/checks.py b/edsteva/utils/checks.py index 211f5dfb..4ea5c98b 100644 --- a/edsteva/utils/checks.py +++ b/edsteva/utils/checks.py @@ -38,7 +38,6 @@ class MissingTableError(Exception): def __init__( self, required_tables: Union[List, dict], - data_name: str = "", ): if isinstance(required_tables, dict): @@ -50,13 +49,7 @@ def __init__( to_display_per_concept = [f"- {concept}" for concept in required_tables] str_to_display = "\n".join(to_display_per_concept) - if data_name: - data_name = f" {data_name} " - message = ( - f"The{data_name}Data is missing some tables, " - "namely:\n" - f"{str_to_display}" - ) + message = f"Data is missing some tables, namely:\n {str_to_display}" super().__init__(message) @@ -68,8 +61,8 @@ def check_columns(df: DataFrame, required_columns: List[str], df_name: str = "") raise MissingColumnError(missing_columns, df_name=df_name) -def check_tables(data: Data, required_tables: List[str], data_name: str = ""): +def check_tables(data: Data, required_tables: List[str]): present_tables = set(data.available_tables) missing_tables = set(required_tables) - present_tables if missing_tables: - raise MissingTableError(missing_tables, data_name=data_name) + raise MissingTableError(missing_tables) diff --git a/pyproject.toml b/pyproject.toml index 757758fd..8c013522 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "edsteva" -version = "0.1.1" +version = "0.1.2" description = "EDS-TeVa provides a set of tools that aims at modeling the adoption over time and across space of the Electronic Health Records." authors = ["Adam Remaki ", "Vicent Maladiere ", "Benoit Playe ", "Romain Bey ", "Paul Bernard "] keywords = ["OMOP", "Data Analysis", "Electronic health record"] diff --git a/tests/test_convert.py b/tests/test_convert.py new file mode 100644 index 00000000..5b514e9b --- /dev/null +++ b/tests/test_convert.py @@ -0,0 +1,64 @@ +import pandas as pd +import pytest +from databricks import koalas as ks + +from edsteva.utils import framework + + +@pytest.fixture() +def example_objects(): + return dict( + pandas=[ + pd.DataFrame({"col": [1, 2, 3]}), + pd.Series([4, 5, 6]), + ], + koalas=[ + ks.DataFrame({"val": [7, 8, 9]}), + ks.Series([10, 11, 12]), + ], + ) + + +def test_identify_pandas(example_objects): + for obj in example_objects["pandas"]: + assert framework.is_pandas(obj) is True + assert framework.is_koalas(obj) is False + assert framework.get_framework(obj) is pd + + +def test_identify_koalas(example_objects): + for obj in example_objects["koalas"]: + assert framework.is_pandas(obj) is False + assert framework.is_koalas(obj) is True + assert framework.get_framework(obj) is ks + + +def test_framework_pandas(example_objects): + for obj in example_objects["pandas"]: + converted = framework.pandas(obj) + assert converted is obj + + for obj in example_objects["koalas"]: + converted = framework.pandas(obj) + assert framework.is_pandas(converted) is True + + +def test_framework_koalas(example_objects): + for obj in example_objects["pandas"]: + converted = framework.koalas(obj) + assert framework.is_koalas(converted) is True + + for obj in example_objects["koalas"]: + converted = framework.koalas(obj) + assert converted is obj + + +def test_unconvertible_objects(): + objects = [1, "coucou", {"a": [1, 2]}, [1, 2, 3], 2.5, ks, pd] + for obj in objects: + with pytest.raises(ValueError): + framework.pandas(obj) + + for obj in objects: + with pytest.raises(ValueError): + framework.koalas(obj) diff --git a/tests/test_model.py b/tests/test_model.py index bcb38311..d0c6b672 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -3,6 +3,7 @@ import pandas as pd import pytest +from edsteva import CACHE_DIR from edsteva.io import SyntheticData from edsteva.metrics import error, error_after_t0 from edsteva.models.rectangle_function import RectangleFunction @@ -47,6 +48,16 @@ def test_step_function_visit_occurence(): start_date=data.t_min, end_date=data.t_max, ) + + # Test Cache saving + visit_model.save() + assert os.path.isfile(CACHE_DIR / "edsteva" / "models" / "stepfunction.pickle") + visit_model = StepFunction() + visit_model.load() + visit_model.delete() + assert not os.path.isfile(CACHE_DIR / "edsteva" / "models" / "stepfunction.pickle") + + # Test target saving visit_model.save( path="test.pickle", ) diff --git a/tests/test_probes.py b/tests/test_probes.py index 6c22f6bd..65ba8c5c 100644 --- a/tests/test_probes.py +++ b/tests/test_probes.py @@ -3,15 +3,43 @@ import pytest -from edsteva import improve_performances +from edsteva import CACHE_DIR, improve_performances from edsteva.io import SyntheticData from edsteva.probes import ConditionProbe, NoteProbe, VisitProbe +from edsteva.utils.checks import MissingColumnError, MissingTableError pytestmark = pytest.mark.filterwarnings("ignore") + improve_performances() data_step = SyntheticData(seed=41, mode="step").generate() data_rect = SyntheticData(seed=41, mode="rect").generate() +data_missing = SyntheticData(seed=41, mode="step").generate() + + +def test_missing_checks(): + with pytest.raises(TypeError): + data_fake = [1, 2, 3] + visit = VisitProbe() + visit.compute( + data=data_fake, + ) + with pytest.raises(MissingColumnError): + data_missing.visit_occurrence = data_missing.visit_occurrence.drop( + columns="visit_occurrence_id" + ) + visit = VisitProbe() + visit.compute( + data=data_missing, + ) + with pytest.raises(MissingTableError): + data_missing.delete_table("unknown_table") # Test typo + data_missing.delete_table("fact_relationship") + visit = VisitProbe() + visit.compute( + data=data_missing, + ) + params = [ dict( @@ -80,7 +108,18 @@ def test_compute_visit_probe(data, params): care_site_ids=params["care_site_ids"], care_site_short_names=params["care_site_short_names"], ) + if params["test_save"]: + # Test Cache saving + visit.save() + assert os.path.isfile(CACHE_DIR / "edsteva" / "probes" / "visitprobe.pickle") + visit = VisitProbe() + visit.load() + visit.delete() + assert not os.path.isfile( + CACHE_DIR / "edsteva" / "probes" / "visitprobe.pickle" + ) + visit.save( path="test.pickle", )