feat: support pyspark 3 (via a databricks.koalas stub)
percevalw committed Dec 7, 2023
1 parent 31ba5f2 commit 81621f6
Showing 13 changed files with 74 additions and 37 deletions.
29 changes: 22 additions & 7 deletions .github/workflows/testing.yml
@@ -49,6 +49,18 @@ jobs:
   needs: check_skip
   if: ${{ needs.check_skip.outputs.skip == 'false' }}
   runs-on: "ubuntu-latest"
+  strategy:
+    fail-fast: true
+    matrix:
+      include:
+        - python-version: "3.7"
+          spark: "spark2"
+        - python-version: "3.7"
+          spark: "spark3"
+        - python-version: "3.8"
+          spark: "spark3"
+        - python-version: "3.9"
+          spark: "spark3"
   name: 'Testing on ubuntu'
   defaults:
     run:
@@ -61,13 +73,16 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v4
       with:
-        python-version: '3.7'
-    - name: Install eds-scikit
-      shell: bash {0}
-      run: ./build_tools/github/install.sh
-    - name: Run tests
-      shell: bash {0}
-      run: ./build_tools/github/test.sh
+        python-version: ${{ matrix.python-version }}
+        cache: 'pip'
+    - name: Install dependencies
+      run: |
+        pip install -U "pip<23"
+        echo Installing eds-scikit with spark version ${{ matrix.spark }}
+        pip install --progress-bar off ".[${{ matrix.spark }}, dev, doc]"
+    - name: Run pytest
+      run: |
+        python -m pytest --pyargs tests -m "" --cov=eds_scikit
     - name: Upload coverage to CodeCov
       uses: codecov/codecov-action@v3
       if: success()
4 changes: 0 additions & 4 deletions build_tools/github/install.sh

This file was deleted.

4 changes: 0 additions & 4 deletions build_tools/github/test.sh

This file was deleted.

1 change: 1 addition & 0 deletions changelog.md
@@ -5,6 +5,7 @@
 ### Changed

 - Support for pyarrow > 0.17.0
+- Support for pyspark 3 (to force pyspark 2, use `pip install eds-scikit[spark2]`)

 ### Fixed
 - Caching in spark instead of koalas to improve speed
1 change: 1 addition & 0 deletions docs/project_description.md
@@ -124,6 +124,7 @@ The goal of **Koalas** is precisely to avoid this issue. It aims at allowing code

 ```python
 from databricks import koalas as ks
+# or from pyspark import pandas as ks, if you have spark 3

 # Converting the Spark DataFrame into a Koalas DataFrame
 visit_occurrence_koalas = visit_occurrence_spark.to_koalas()
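For code that must run under either Spark version without relying on eds-scikit's import stub (introduced below), a version-agnostic import can be sketched as follows; it simply mirrors the fallback logic of the `databricks.koalas` stub added by this commit, and the `ks` alias is just a convention:

```python
# A minimal sketch, assuming only pyspark (2 or 3) is installed:
# prefer the legacy koalas package, fall back to pyspark.pandas,
# which replaces koalas as of Spark 3.
try:
    from databricks import koalas as ks  # Spark 2 + the koalas package
except ImportError:
    import pyspark.pandas as ks  # Spark 3 bundles the koalas API here
```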
4 changes: 2 additions & 2 deletions eds_scikit/__init__.py
@@ -26,15 +26,15 @@
 from pyspark import SparkContext
 from pyspark.sql import SparkSession

-import eds_scikit.biology  # noqa: F401 --> To register functions
-
 pyarrow.open_stream = pyarrow.ipc.open_stream

 sys.path.insert(
     0, (pathlib.Path(__file__).parent / "package-override").absolute().as_posix()
 )
 os.environ["PYTHONPATH"] = ":".join(sys.path)

+import eds_scikit.biology  # noqa: F401 --> To register functions
+
 # Remove SettingWithCopyWarning
 pd.options.mode.chained_assignment = None

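Moving the `eds_scikit.biology` import below the `sys.path` manipulation ensures the `package-override` directory is already first on the path when the heavy imports run, and exporting `PYTHONPATH` propagates the same resolution order to Spark executor processes. A toy sketch of the shadowing mechanism (the stub `json` module is purely illustrative, not eds-scikit code):

```python
import pathlib
import sys
import tempfile

# A stub module placed in a directory prepended to sys.path shadows
# the standard one, provided any cached entry is evicted first.
stub_dir = pathlib.Path(tempfile.mkdtemp())
(stub_dir / "json.py").write_text("MARKER = 'stub'\n")

sys.modules.pop("json", None)  # drop the already-imported module, if any
sys.path.insert(0, str(stub_dir))

import json  # now resolves to the stub

assert json.MARKER == "stub"
```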
2 changes: 2 additions & 0 deletions eds_scikit/biology/viz/aggregate.py
@@ -135,6 +135,8 @@ def aggregate_concepts_set(

     # Extract concept-set
     measurement_std_filtered = get_measurement_std(measurement_valid, src_to_std)
+    if is_koalas(measurement_std_filtered):
+        measurement_std_filtered.spark.cache()
     measurement_std_filtered = measurement_std_filtered.drop(
         columns="source_concept_id"
     )
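Koalas frames evaluate lazily, so a frame consumed several times downstream is otherwise recomputed from its whole Spark plan on each use; `DataFrame.spark.cache()` persists the underlying Spark DataFrame. A minimal sketch of the pattern with illustrative data (`is_koalas` itself is eds-scikit's own type check):

```python
import databricks.koalas as ks

kdf = ks.DataFrame({"concept_id": [1, 1, 2], "value": [0.1, 0.4, 0.3]})

# Persist the underlying Spark DataFrame so that both aggregations
# below reuse the computed result instead of re-running the plan.
kdf.spark.cache()

means = kdf.groupby("concept_id")["value"].mean()
count = len(kdf)
```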
0 changes: 0 additions & 0 deletions eds_scikit/package-override/databricks/__init__.py

Empty file.
17 changes: 17 additions & 0 deletions eds_scikit/package-override/databricks/koalas/__init__.py
@@ -0,0 +1,17 @@
+# This file is used to override the databricks.koalas package with the pyspark.pandas
+# package, if the databricks.koalas package is not available (python >= 3.8)
+import sys
+import pyarrow  # noqa: E402, F401
+
+old_sys_path = sys.path.copy()
+sys.path.remove(next((p for p in sys.path if "package-override" in p), None))
+databricks = sys.modules.pop("databricks")
+sys.modules.pop("databricks.koalas")
+try:
+    from databricks.koalas import *  # noqa: E402, F401, F403
+except ImportError:
+    from pyspark.pandas import *  # noqa: E402, F401, F403
+
+sys.modules["databricks"] = databricks
+sys.modules["databricks.koalas"] = sys.modules["pyspark.pandas"]
+sys.path[:] = old_sys_path
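With this stub first on `sys.path`, downstream code keeps a single import path whatever Spark version is installed. A short sketch of what callers see (the frame contents are illustrative):

```python
# Resolves to the real databricks.koalas under Spark 2, and to the
# re-exported pyspark.pandas under Spark 3 -- same code either way.
from databricks import koalas as ks

kdf = ks.DataFrame({"person_id": [1, 2], "age": [30, 40]})
print(kdf["age"].mean())
```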
13 changes: 6 additions & 7 deletions eds_scikit/package-override/pyarrow/__init__.py
@@ -21,18 +21,17 @@
 is the only one that resolves to this very module, still gets what it asked for:
 the pyarrow module's content.
 """

 import sys

 old_sys_path = sys.path.copy()
 sys.path.remove(next((p for p in sys.path if "package-override" in p), None))
 del sys.modules["pyarrow"]
-import pyarrow  # noqa: E402, F401

-try:
-    import pyarrow.ipc
+import pyarrow  # noqa: E402, F401
+from pyarrow.ipc import open_stream  # noqa: E402, F401

-    pyarrow.open_stream = pyarrow.ipc.open_stream
-except ImportError:
-    pass
+pyarrow.open_stream = open_stream

 from pyarrow import *  # noqa: F401, F403, E402

 sys.path[:] = old_sys_path
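koalas 1.x still calls `pyarrow.open_stream`, which newer pyarrow releases removed in favor of `pyarrow.ipc.open_stream`; the override re-exposes the old name so koalas keeps working. A minimal sketch of the restored alias in action (the record batch is illustrative):

```python
import pyarrow as pa

# Re-create the alias the override installs.
pa.open_stream = pa.ipc.open_stream

# Round-trip a record batch through the IPC stream format.
batch = pa.RecordBatch.from_pydict({"person_id": [1, 2, 3]})
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)

# Legacy-style call, as old koalas code performs it.
table = pa.open_stream(sink.getvalue()).read_all()
print(table)
```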
15 changes: 9 additions & 6 deletions pyproject.toml
@@ -35,19 +35,18 @@ dependencies = [
"pgpasslib>=1.1.0, <2.0.0",
"psycopg2-binary>=2.9.0, <3.0.0",
"pandas>=1.3.0, <2.0.0",
"numpy>=1.0.0, <1.20",
"koalas>=1.8.1, <2.0.0",
"numpy>=1.0.0",
"altair>=5.0.0, <6.0.0",
"loguru==0.7.0",
"pypandoc==1.7.5",
"pyspark==2.4.3",
"pyspark",
"pyarrow>=0.10.0",
"pretty-html-table>=0.9.15, <0.10.0",
"catalogue",
"schemdraw>=0.15.0, <1.0.0",
"ipython>=7.32.0, <8.0.0",
"packaging==21.3",
"tomli==2.0.1",
"ipython>=7.32.0",
"packaging>=21.3",
"tomli>=2.0.1",
]
dynamic = ['version']

@@ -66,6 +65,10 @@ Documentation = "https://aphp.github.io/eds-scikit"
"Bug Tracker" = "https://github.com/aphp/eds-scikit/issues"

[project.optional-dependencies]
spark2 = [
"pyspark==2.4.3",
"koalas>=1.8.1,<2.0.0",
]
dev = [
"black>=22.3.0, <23.0.0",
"flake8==3.9.2",
17 changes: 12 additions & 5 deletions tests/conftest.py
@@ -1,6 +1,8 @@
+# isort: skip_file
 import logging
 import os

+import eds_scikit
 import pandas as pd
 import pytest
 from _pytest.logging import caplog as _caplog  # noqa F401
@@ -78,17 +80,13 @@ def spark_session(pytestconfig, tmpdir_factory):
     print("!! Creating spark session !!")

     from pyspark import SparkConf
+    from pyspark import __version__ as pyspark_version

     temp_warehouse_dir = tmpdir_factory.mktemp("spark")
     conf = (
         SparkConf()
         .setMaster("local")
         .setAppName("testing")
-        # used to overwrite hive tables
-        .set(
-            "spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation",
-            "true",
-        )
         # Path to data and metastore
         # Note: the option "hive.metastore.warehouse.dir" is deprecated
         # But javax.jdo.option.ConnectionURL can be used for the path of 'metastrore_db'
@@ -101,8 +99,17 @@
"javax.jdo.option.ConnectionURL",
f"jdbc:derby:;databaseName={temp_warehouse_dir}/metastore_db;create=true",
)
.set("spark.executor.cores", 1)
)

if pyspark_version < "3":

# used to overwrite hive tables
conf = conf.set(
"spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation",
"true",
)

session, _, _ = improve_performances(to_add_conf=list(conf.getAll()))

# session is ready
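The `pyspark_version < "3"` guard compares raw strings, which works here because "2.4.3" sorts below "3" lexicographically. A more defensive variant, sketched below, parses the version with `packaging` (already a runtime dependency):

```python
from packaging.version import parse
from pyspark import SparkConf, __version__ as pyspark_version

conf = SparkConf()

# Parsed comparison is robust even for a hypothetical "10.0.0",
# which as a plain string would wrongly sort below "3".
if parse(pyspark_version) < parse("3"):
    # Spark 2 only: allow overwriting managed hive tables in place.
    conf = conf.set(
        "spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation",
        "true",
    )
```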
4 changes: 2 additions & 2 deletions tests/test_convert.py
@@ -51,9 +51,9 @@ def test_framework_koalas(example_objects):
 def test_unconvertible_objects():
     objects = [1, "coucou", {"a": [1, 2]}, [1, 2, 3], 2.5, ks, pd]
     for obj in objects:
-        with pytest.raises(ValueError):
+        with pytest.raises((ValueError, TypeError)):
             framework.pandas(obj)

     for obj in objects:
-        with pytest.raises(ValueError):
+        with pytest.raises((ValueError, TypeError)):
             framework.koalas(obj)
