diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index a1e13b41..859e6887 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -49,6 +49,20 @@ jobs:
     needs: check_skip
     if: ${{ needs.check_skip.outputs.skip == 'false' }}
     runs-on: "ubuntu-latest"
+    strategy:
+      fail-fast: true
+      matrix:
+        include:
+          - python-version: "3.7"
+            spark: "spark2"
+          - python-version: "3.7"
+            spark: "spark3"
+          - python-version: "3.8"
+            spark: "spark3"
+          - python-version: "3.9"
+            spark: "spark3"
+          - python-version: "3.10"
+            spark: "spark3"
     name: 'Testing on ubuntu'
     defaults:
       run:
@@ -61,13 +75,15 @@
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: '3.7'
-      - name: Install eds-scikit
-        shell: bash {0}
-        run: ./build_tools/github/install.sh
-      - name: Run tests
-        shell: bash {0}
-        run: ./build_tools/github/test.sh
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+      - name: Install dependencies
+        run: |
+          pip install -U pip wheel
+          pip install --progress-bar off ".[${{ matrix.spark }}, dev, doc]"
+      - name: Run pytest
+        run: |
+          python -m pytest --pyargs tests -m "" --cov=eds_scikit
       - name: Upload coverage to CodeCov
         uses: codecov/codecov-action@v3
         if: success()
diff --git a/build_tools/github/install.sh b/build_tools/github/install.sh
deleted file mode 100755
index 4612b415..00000000
--- a/build_tools/github/install.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash -e
-
-pip install -U "pip<23"
-pip install --progress-bar off --upgrade ".[dev, doc]"
diff --git a/build_tools/github/test.sh b/build_tools/github/test.sh
deleted file mode 100755
index f28f9358..00000000
--- a/build_tools/github/test.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash -x
-
-pip install -U "pip<23"
-python -m pytest --pyargs tests -m "" --cov=eds_scikit
diff --git a/changelog.md b/changelog.md
index fd08949a..b6748545 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,10 +1,18 @@
 # Changelog
 
 ## Unreleased
+
+### Added
+
+- Support for pyarrow > 0.17.0
+- Support for Python 3.7 to 3.10 (3.11 or higher is not tested)
+- Support for pyspark 3 (to force pyspark 2, use `pip install eds-scikit[spark2]`)
+
 ### Fixed
 
 - Caching in spark instead of koalas to improve speed
 
 ## v0.1.6 (2023-09-27)
+
 ### Added
 - Module ``event_sequences`` to visualize individual sequences of events.
 - Module ``age_pyramid`` to quickly visualize the age and gender distributions in a cohort.
diff --git a/docs/generate_reference.py b/docs/generate_reference.py
index 2be7cf8f..b3863353 100644
--- a/docs/generate_reference.py
+++ b/docs/generate_reference.py
@@ -8,7 +8,7 @@ for path in sorted(Path("eds_scikit").rglob("*.py")):
     print(path)
 
-    if ".ipynb_checkpoints" in path.parts:
+    if ".ipynb_checkpoints" in path.parts or "package-override" in path.parts:
         continue
 
     module_path = path.relative_to(".").with_suffix("")
     doc_path = path.relative_to("eds_scikit").with_suffix(".md")
diff --git a/docs/project_description.md b/docs/project_description.md
index 97fb139f..1d1321b7 100644
--- a/docs/project_description.md
+++ b/docs/project_description.md
@@ -124,6 +124,7 @@ The goal of **Koalas** is precisely to avoid this issue.
 It aims at allowing code written in pandas to be executed on Spark.
 
 ```python
 from databricks import koalas as ks
+# or from pyspark import pandas as ks, if you have spark 3
 
 # Converting the Spark DataFrame into a Koalas DataFrame
 visit_occurrence_koalas = visit_occurrence_spark.to_koalas()
diff --git a/eds_scikit/__init__.py b/eds_scikit/__init__.py
index 52cc3140..72070e42 100644
--- a/eds_scikit/__init__.py
+++ b/eds_scikit/__init__.py
@@ -11,6 +11,7 @@
 import importlib
 import os
+import pathlib
 import sys
 import time
 
 from packaging import version
@@ -19,15 +20,20 @@
 import pandas as pd
 import pyarrow
+import pyarrow.ipc
 import pyspark
 from loguru import logger
 from pyspark import SparkContext
 from pyspark.sql import SparkSession
 
-import eds_scikit.biology  # noqa: F401 --> To register functions
+pyarrow.open_stream = pyarrow.ipc.open_stream
 
-import eds_scikit.utils.logging
+sys.path.insert(
+    0, (pathlib.Path(__file__).parent / "package-override").absolute().as_posix()
+)
+os.environ["PYTHONPATH"] = ":".join(sys.path)
 
+import eds_scikit.biology  # noqa: F401 --> To register functions
 
 # Remove SettingWithCopyWarning
 pd.options.mode.chained_assignment = None
diff --git a/eds_scikit/biology/utils/config.py b/eds_scikit/biology/utils/config.py
index ede16420..ec29e0de 100644
--- a/eds_scikit/biology/utils/config.py
+++ b/eds_scikit/biology/utils/config.py
@@ -1,9 +1,9 @@
 import glob
+import os
 from pathlib import Path
 from typing import List
 
 import pandas as pd
-from importlib_metadata import os
 from loguru import logger
 
 from eds_scikit.biology.utils.process_concepts import ConceptsSet
diff --git a/eds_scikit/package-override/databricks/__init__.py b/eds_scikit/package-override/databricks/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/eds_scikit/package-override/databricks/koalas/__init__.py b/eds_scikit/package-override/databricks/koalas/__init__.py
new file mode 100644
index 00000000..b99906b4
--- /dev/null
+++ b/eds_scikit/package-override/databricks/koalas/__init__.py
@@ -0,0 +1,17 @@
+# This file overrides the databricks.koalas package with the pyspark.pandas
+# package when databricks.koalas is not available (python >= 3.8)
+import sys
+
+import pyarrow  # noqa: E402, F401
+
+old_sys_path = sys.path.copy()
+sys.path.remove(next((p for p in sys.path if "package-override" in p), None))
+databricks = sys.modules.pop("databricks")
+sys.modules.pop("databricks.koalas")
+try:
+    from databricks.koalas import *  # noqa: E402, F401, F403
+except ImportError:
+    from pyspark.pandas import *  # noqa: E402, F401, F403
+
+    sys.modules["databricks"] = databricks
+    sys.modules["databricks.koalas"] = sys.modules["pyspark.pandas"]
+sys.path[:] = old_sys_path
diff --git a/eds_scikit/package-override/pyarrow/__init__.py b/eds_scikit/package-override/pyarrow/__init__.py
new file mode 100644
index 00000000..c2200a55
--- /dev/null
+++ b/eds_scikit/package-override/pyarrow/__init__.py
@@ -0,0 +1,37 @@
+"""
+PySpark 2 needs pyarrow.open_stream, which was deprecated in 0.17.0 in favor of
+pyarrow.ipc.open_stream. Here is how we monkey-patch pyarrow to add back
+pyarrow.open_stream for versions > 0.17 and how we make this work with pyspark
+distributed computing:
+1. We add this fake eds_scikit/package-override/pyarrow package to the python lookup
+   list (the PYTHONPATH env var) in eds_scikit/__init__.py: this env variable will be
+   shared with the executors
+2. When an executor starts and imports packages, it looks them up by inspecting the
+   paths in PYTHONPATH.
+   It finds our fake pyarrow package first and executes the current
+   eds_scikit/package-override/pyarrow/__init__.py file
+3. In this file, we remove the fake pyarrow package path from the lookup list, unload
+   the current module from the python modules cache (sys.modules) and re-import
+   pyarrow => the executor's python will this time load the true pyarrow and store it
+   in sys.modules. Subsequent "import pyarrow" calls will return the
+   sys.modules["pyarrow"] value, which is the true pyarrow module.
+4. We are not finished: we add back the deprecated "open_stream" function that was
+   removed in pyarrow 0.17.0 (the reason for all this hacking) by setting it
+   on the true pyarrow module
+5. We still export the pyarrow module content (*) such that the first import, which
+   is the only one that resolves to this very module, still gets what it asked for:
+   the pyarrow module's content.
+"""
+import sys
+
+old_sys_path = sys.path.copy()
+sys.path.remove(next((p for p in sys.path if "package-override" in p), None))
+del sys.modules["pyarrow"]
+
+import pyarrow  # noqa: E402, F401
+from pyarrow.ipc import open_stream  # noqa: E402, F401
+
+pyarrow.open_stream = open_stream
+
+from pyarrow import *  # noqa: F401, F403, E402
+
+sys.path[:] = old_sys_path
diff --git a/eds_scikit/utils/test_utils.py b/eds_scikit/utils/test_utils.py
index c0403e8a..53520964 100644
--- a/eds_scikit/utils/test_utils.py
+++ b/eds_scikit/utils/test_utils.py
@@ -96,7 +96,7 @@ def assert_equal(
     return output
 
 
-def assert_images_equal(image_1: str, image_2: str):
+def image_diff(image_1: str, image_2: str):
     img1 = Image.open(image_1)
     img2 = Image.open(image_2)
 
@@ -105,12 +105,11 @@ def assert_images_equal(image_1: str, image_2: str):
         img2 = img2.resize(img1.size)
 
     sum_sq_diff = np.sum(
-        (np.asarray(img1).astype("float") - np.asarray(img2).astype("float")) ** 2
-    )
-
-    if sum_sq_diff == 0:
-        # Images are exactly the same
-        pass
-    else:
-        normalized_sum_sq_diff = sum_sq_diff / np.sqrt(sum_sq_diff)
-        assert normalized_sum_sq_diff < 0.001
+        (
+            np.asarray(img1).astype("float") / 255
+            - np.asarray(img2).astype("float") / 255
+        )
+        ** 2
+    ) / np.prod(img1.size)
+
+    return sum_sq_diff
diff --git a/pyproject.toml b/pyproject.toml
index 0a5310c2..30c30b99 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,24 +30,22 @@ classifiers = [
     "License :: OSI Approved :: BSD License",
     "Operating System :: Unix",
 ]
-requires-python = ">=3.7.1,<3.8"
+requires-python = ">=3.7.1"
 dependencies = [
     "pgpasslib>=1.1.0, <2.0.0",
-    "psycopg2-binary>=2.9.0, <3.0.0",
+    "psycopg2-binary>=2.9.0",
     "pandas>=1.3.0, <2.0.0",
-    "numpy>=1.0.0, <1.20",
-    "koalas>=1.8.1, <2.0.0",
-    "altair>=5.0.0, <6.0.0",
+    "numpy>=1.0.0",
+    "altair>=5.0.0",
     "loguru==0.7.0",
-    "pypandoc==1.7.5",
-    "pyspark==2.4.3",
-    "pyarrow==0.17.0", #"pyarrow>=0.10, <0.17.0",
+    "pyspark",
+    "pyarrow>=0.10.0",
     "pretty-html-table>=0.9.15, <0.10.0",
     "catalogue",
     "schemdraw>=0.15.0, <1.0.0",
-    "ipython>=7.32.0, <8.0.0",
-    "packaging==21.3",
-    "tomli==2.0.1",
+    "ipython>=7.32.0",
+    "packaging>=21.3",
+    "tomli>=2.0.1",
 ]
 
 dynamic = ['version']
@@ -66,6 +64,10 @@ Documentation = "https://aphp.github.io/eds-scikit"
 "Bug Tracker" = "https://github.com/aphp/eds-scikit/issues"
 
 [project.optional-dependencies]
+spark2 = [
+    "pyspark==2.4.3",
+    "koalas>=1.8.1,<2.0.0",
+]
 dev = [
     "black>=22.3.0, <23.0.0",
     "flake8==3.9.2",
diff --git a/tests/conftest.py b/tests/conftest.py
index ca57ef17..4c3ba10f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,12 +1,15 @@
+# isort: skip_file
 import logging
 import os
 
+import eds_scikit
 import pandas as pd
 import pytest
 from _pytest.logging import caplog as _caplog  # noqa F401
 from databricks import koalas as ks
 from loguru import logger
 
+import eds_scikit.utils.logging  # noqa: F401
 from eds_scikit import improve_performances
 
 from . import test_registry  # noqa: F401 --> To register functions
@@ -83,11 +86,6 @@ def spark_session(pytestconfig, tmpdir_factory):
         SparkConf()
         .setMaster("local")
         .setAppName("testing")
-        # used to overwrite hive tables
-        .set(
-            "spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation",
-            "true",
-        )
         # Path to data and metastore
         # Note: the option "hive.metastore.warehouse.dir" is deprecated
         # But javax.jdo.option.ConnectionURL can be used for the path of 'metastrore_db'
@@ -100,6 +98,7 @@
             "javax.jdo.option.ConnectionURL",
             f"jdbc:derby:;databaseName={temp_warehouse_dir}/metastore_db;create=true",
         )
+        .set("spark.executor.cores", 1)
     )
 
     session, _, _ = improve_performances(to_add_conf=list(conf.getAll()))
diff --git a/tests/flowchart/expected_flowchart_bis.png b/tests/flowchart/expected_flowchart_bis.png
new file mode 100644
index 00000000..3d6c42e2
Binary files /dev/null and b/tests/flowchart/expected_flowchart_bis.png differ
diff --git a/tests/flowchart/test_flowchart.py b/tests/flowchart/test_flowchart.py
index f127e4f3..bc0f98d1 100644
--- a/tests/flowchart/test_flowchart.py
+++ b/tests/flowchart/test_flowchart.py
@@ -5,7 +5,7 @@
 import pytest
 
 from eds_scikit.utils.flowchart import Flowchart
-from eds_scikit.utils.test_utils import assert_images_equal
+from eds_scikit.utils.test_utils import image_diff
 
 data_as_df = pd.DataFrame(
     dict(
@@ -63,12 +63,13 @@ def test_flowchart(data, tmpdir_factory):
 
     _ = F.generate_flowchart(alternate=True, fontsize=10)
 
-    result_path = tmp_dir / "flowchart.png"
-    F.save(result_path, dpi=72)
+    out_path = str(tmp_dir / "flowchart.png")
+    F.save(out_path, dpi=72)
 
-    expected = Path(__file__).parent / "expected_flowchart.png"
+    tgt_1 = str(Path(__file__).parent / "expected_flowchart.png")
+    tgt_2 = str(Path(__file__).parent / "expected_flowchart_bis.png")
 
-    assert_images_equal(result_path, expected)
+    assert image_diff(out_path, tgt_1) < 0.05 or image_diff(out_path, tgt_2) < 0.05
 
 
 def test_incorrect_data():
diff --git a/tests/test_biology.py b/tests/test_biology.py
index eb046ec9..12af6e48 100644
--- a/tests/test_biology.py
+++ b/tests/test_biology.py
@@ -14,7 +14,10 @@ def tmp_biology_dir(tmp_path_factory):
 
 @pytest.fixture
 def data():
-    return load_biology_data(seed=42)
+    return load_biology_data(
+        seed=42,
+        mean_measurement=500,
+    )
 
 
 @pytest.fixture
@@ -73,6 +76,7 @@ def test_biology_summary(data, concepts_sets, module, tmp_biology_dir):
         limit_count=("AnaBio", 500),
         stats_only=True,
         save_folder_path=tmp_biology_dir,
+        pd_limit_size=0,
    )
diff --git a/tests/test_convert.py b/tests/test_convert.py
index 671bbe28..ee9b1f3d 100644
--- a/tests/test_convert.py
+++ b/tests/test_convert.py
@@ -51,9 +51,9 @@ def test_framework_koalas(example_objects):
 def test_unconvertible_objects():
     objects = [1, "coucou", {"a": [1, 2]}, [1, 2, 3], 2.5, ks, pd]
     for obj in objects:
-        with pytest.raises(ValueError):
+        with pytest.raises((ValueError, TypeError)):
             framework.pandas(obj)
 
     for obj in objects:
-        with pytest.raises(ValueError):
+        with pytest.raises((ValueError, TypeError)):
             framework.koalas(obj)
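
Note on the `package-override` shims above: their net effect is that `pyarrow.open_stream`, which pyspark 2 still calls, exists again both on the driver (patched directly in `eds_scikit/__init__.py`) and on the executors (via the fake packages prepended to `PYTHONPATH`). A minimal sketch of how the driver-side patch can be checked, assuming eds-scikit and a pyarrow release newer than 0.17 are installed:

```python
import eds_scikit  # noqa: F401  - importing eds_scikit applies the patch as a side effect
import pyarrow

# eds_scikit/__init__.py sets pyarrow.open_stream = pyarrow.ipc.open_stream,
# so the deprecated alias that pyspark 2 expects is available again.
assert pyarrow.open_stream is pyarrow.ipc.open_stream
```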
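Similarly, the flowchart test now relies on the reworked `image_diff` helper, which returns a normalized squared-difference score (0.0 for identical images) instead of asserting pixel-perfect equality. A hypothetical standalone use, with placeholder file names, could look like:

```python
from eds_scikit.utils.test_utils import image_diff

# "output.png" and "reference.png" are placeholder paths for illustration.
score = image_diff("output.png", "reference.png")

# test_flowchart accepts a rendering when its score against either reference
# image stays below 0.05; identical images give exactly 0.0.
assert score < 0.05
```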