Skip to content

Commit

Permalink
fix(datasets): Don't warn for SparkDataset on Databricks when using s3 (
Browse files Browse the repository at this point in the history
kedro-org#341)

Signed-off-by: Alistair McKelvie <[email protected]>
  • Loading branch information
alamastor authored Oct 12, 2023
1 parent cc75b40 commit b946eec
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 5 deletions.
6 changes: 3 additions & 3 deletions kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
* Moved `PartitionedDataSet` and `IncrementalDataSet` from the core Kedro repo to `kedro-datasets` and renamed to `PartitionedDataset` and `IncrementalDataset`.

## Bug fixes and other changes
* Fixed erroneous warning when using a cloud protocol file path with SparkDataset on Databricks.
* Updated `PickleDataset` to explicitly mention `cloudpickle` support.

## Upcoming deprecations for Kedro-Datasets 2.0.0
## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
* [PtrBld](https://github.com/PtrBld)

## Community contributions
Many thanks to the following Kedroids for contributing PRs to this release:
* [Alistair McKelvie](https://github.com/alamastor)
* [Felix Wittmann](https://github.com/hfwittmann)

# Release 1.7.1
Expand Down
13 changes: 11 additions & 2 deletions kedro-datasets/kedro_datasets/spark/spark_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@

import fsspec
from hdfs import HdfsError, InsecureClient
from kedro.io.core import Version, get_filepath_str, get_protocol_and_path
from kedro.io.core import (
CLOUD_PROTOCOLS,
Version,
get_filepath_str,
get_protocol_and_path,
)
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.utils import AnalysisException
Expand Down Expand Up @@ -284,7 +289,11 @@ def __init__( # noqa: PLR0913
glob_function = None
self.metadata = metadata

if not filepath.startswith("/dbfs/") and _deployed_on_databricks():
if (
not filepath.startswith("/dbfs/")
and fs_prefix not in (protocol + "://" for protocol in CLOUD_PROTOCOLS)
and _deployed_on_databricks()
):
logger.warning(
"Using SparkDataset on Databricks without the `/dbfs/` prefix in the "
"filepath is a known source of error. You must add this prefix to %s",
Expand Down
6 changes: 6 additions & 0 deletions kedro-datasets/tests/spark/test_spark_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,12 @@ def test_dbfs_prefix_warning_on_databricks_no_prefix(self, monkeypatch, caplog):
SparkDataset(filepath=filepath)
assert expected_message in caplog.text

def test_dbfs_prefix_warning_databricks_s3(self, monkeypatch, caplog):
    """No `/dbfs/` warning should be emitted on Databricks for cloud (s3) paths."""
    # Simulate running inside a Databricks runtime environment.
    monkeypatch.setenv("DATABRICKS_RUNTIME_VERSION", "7.3")
    s3_path = "s3://my_project/data/02_intermediate/processed_data"
    SparkDataset(filepath=s3_path)
    # The dataset must not log the `/dbfs/`-prefix warning for a cloud protocol.
    assert caplog.text == ""


class TestSparkDatasetVersionedLocal:
def test_no_version(self, versioned_dataset_local):
Expand Down

0 comments on commit b946eec

Please sign in to comment.