Skip to content

Commit

Permalink
fix(datasets): Don't warn for SparkDataset on Databricks when using s3 (
Browse files Browse the repository at this point in the history
kedro-org#341)

Signed-off-by: Alistair McKelvie <[email protected]>
  • Loading branch information
alamastor authored Oct 12, 2023
1 parent cc75b40 commit b946eec
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 5 deletions.
6 changes: 3 additions & 3 deletions kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
* Moved `PartitionedDataSet` and `IncrementalDataSet` from the core Kedro repo to `kedro-datasets` and renamed to `PartitionedDataset` and `IncrementalDataset`.

## Bug fixes and other changes
* Fixed erroneous warning when using a cloud protocol file path with SparkDataset on Databricks.
* Updated `PickleDataset` to explicitly mention `cloudpickle` support.

## Upcoming deprecations for Kedro-Datasets 2.0.0
## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
* [PtrBld](https://github.com/PtrBld)

## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
* [Alistair McKelvie](https://github.com/alamastor)
* [Felix Wittmann](https://github.com/hfwittmann)

# Release 1.7.1
Expand Down
13 changes: 11 additions & 2 deletions kedro-datasets/kedro_datasets/spark/spark_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@

import fsspec
from hdfs import HdfsError, InsecureClient
from kedro.io.core import Version, get_filepath_str, get_protocol_and_path
from kedro.io.core import (
CLOUD_PROTOCOLS,
Version,
get_filepath_str,
get_protocol_and_path,
)
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.utils import AnalysisException
Expand Down Expand Up @@ -284,7 +289,11 @@ def __init__( # noqa: PLR0913
glob_function = None
self.metadata = metadata

if not filepath.startswith("/dbfs/") and _deployed_on_databricks():
if (
not filepath.startswith("/dbfs/")
and fs_prefix not in (protocol + "://" for protocol in CLOUD_PROTOCOLS)
and _deployed_on_databricks()
):
logger.warning(
"Using SparkDataset on Databricks without the `/dbfs/` prefix in the "
"filepath is a known source of error. You must add this prefix to %s",
Expand Down
6 changes: 6 additions & 0 deletions kedro-datasets/tests/spark/test_spark_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,12 @@ def test_dbfs_prefix_warning_on_databricks_no_prefix(self, monkeypatch, caplog):
SparkDataset(filepath=filepath)
assert expected_message in caplog.text

def test_dbfs_prefix_warning_databricks_s3(self, monkeypatch, caplog):
    """No `/dbfs/` warning should be emitted on Databricks for cloud (s3) paths."""
    # Simulate running inside a Databricks runtime environment.
    monkeypatch.setenv("DATABRICKS_RUNTIME_VERSION", "7.3")
    s3_path = "s3://my_project/data/02_intermediate/processed_data"
    SparkDataset(filepath=s3_path)
    # The dataset must not log the `/dbfs/`-prefix warning for a cloud protocol.
    assert caplog.text == ""


class TestSparkDatasetVersionedLocal:
def test_no_version(self, versioned_dataset_local):
Expand Down

0 comments on commit b946eec

Please sign in to comment.