
[FEATURE] Update dataframe batch.validate workflow #10165

Merged: 28 commits, merged Aug 6, 2024

Commits
e5f0d7c
Initial pass for pandas batch.validate
billdirks Aug 1, 2024
b6b2cef
Fix lint errors.
billdirks Aug 1, 2024
40db097
cleanup stray references to deleted dataframe.
billdirks Aug 1, 2024
d19f2ed
Fix pandas batch request validation.
billdirks Aug 1, 2024
560b6a7
Update build_batch_list signature in pandas_datasource.pyi file.
billdirks Aug 1, 2024
6ff0243
Update missed update to build_batch_request
billdirks Aug 1, 2024
6b5705e
Don't append dataframe to batch_id.
billdirks Aug 1, 2024
b4b1291
Fix linting errors.
billdirks Aug 1, 2024
bbef71f
Fix some tests.
billdirks Aug 3, 2024
210199e
Fix some static analysis tests.
billdirks Aug 5, 2024
745f2e4
Generalize BuildBatchRequestError.
billdirks Aug 5, 2024
64faf55
Update spark data asset to remove dataframe.
billdirks Aug 5, 2024
e52bb02
Update calls to spark add_dataframe_asset
billdirks Aug 5, 2024
4d26889
Fix some tests.
billdirks Aug 5, 2024
20511e6
Update schemas.
billdirks Aug 5, 2024
b54d253
Fix some more tests.
billdirks Aug 5, 2024
a098801
Exclude dataframe from batch metadata.
billdirks Aug 5, 2024
dcc18fa
Update test to use batch definition instead of batch request.
billdirks Aug 5, 2024
b5ce308
Merge branch 'develop' into f/v1-384/pandas-runtime-1
billdirks Aug 5, 2024
8458f77
Update method call in test to use right name of argument.
billdirks Aug 5, 2024
f68eb0b
Fix more tests.
billdirks Aug 5, 2024
806613e
Merge branch 'develop' into f/v1-384/pandas-runtime-1
billdirks Aug 5, 2024
3af8eaa
Merge branch 'develop' into f/v1-384/pandas-runtime-1
billdirks Aug 5, 2024
acdd886
Add e2e tests for runtime batches and batch definitions
billdirks Aug 5, 2024
093e3da
Merge branch 'develop' into f/v1-384/pandas-runtime-1
billdirks Aug 5, 2024
81bfa72
Fix import.
billdirks Aug 6, 2024
04df188
Rename PySparkDataFrame to SparkDataFrame for consistency.
billdirks Aug 6, 2024
92d252e
Merge branch 'develop' into f/v1-384/pandas-runtime-1
billdirks Aug 6, 2024
@@ -44,7 +44,7 @@

# Python
# <snippet name="docs/docusaurus/docs/snippets/get_existing_data_asset_from_existing_datasource_pandas_filesystem_example.py build_batch_request_with_dataframe">
- my_batch_request = my_asset.build_batch_request(dataframe=dataframe)
+ my_batch_request = my_asset.build_batch_request(options={"dataframe": dataframe})
Contributor:
First impression here: does this interface put us in a place where things are less predictable? Should there just be a different method for building a batch request via dataframe?

Member:
Hmm, I like the unified method call. Perhaps we can add a specialized method based on need?

Contributor (author):

We need a better runtime story; this is a short-term stopgap. Currently, `options` and the other parameters already exist on the signature, and this method throws if you pass them, so this change adds consistency. It also prevents attaching dataframes to the asset, which had some weird consequences.
But overall, I agree that this isn't ideal.
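The contract being discussed — the dataframe rides along in the generic `options` dict rather than a dedicated argument — can be sketched in isolation. This is a toy stand-in, not the real Great Expectations class; only the exception name and the one-key rule are taken from the PR's diff:

```python
# Toy sketch of the unified build_batch_request contract discussed above.
# The class/function names here are illustrative, not the real GX API.
class BuildBatchRequestError(Exception):
    pass


def validate_dataframe_options(options):
    """Enforce the stopgap rule: options must hold exactly the 'dataframe' key."""
    if not (options is not None and "dataframe" in options and len(options) == 1):
        raise BuildBatchRequestError("options must contain exactly 1 key, 'dataframe'.")
    return options["dataframe"]


df_sentinel = object()  # stands in for a pandas DataFrame
assert validate_dataframe_options({"dataframe": df_sentinel}) is df_sentinel
```

The check deliberately rejects both a missing `options` and any extra keys, which is what makes the unified signature predictable despite accepting a generic mapping.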

# </snippet>

# Python
2 changes: 1 addition & 1 deletion docs/docusaurus/docs/snippets/result_format.py
@@ -19,7 +19,7 @@
context = gx.get_context()
datasource = context.data_sources.add_pandas(name="my_pandas_datasource")
data_asset = datasource.add_dataframe_asset(name="my_df")
- my_batch_request = data_asset.build_batch_request(dataframe=dataframe)
+ my_batch_request = data_asset.build_batch_request(options={"dataframe": dataframe})
context.suites.add(ExpectationSuite(name="my_expectation_suite"))
my_validator = context.get_validator(
batch_request=my_batch_request,
8 changes: 1 addition & 7 deletions great_expectations/datasource/datasource_dict.py
@@ -129,13 +129,7 @@
datasource_name=name,
data_asset_name=asset.name,
)
-        cached_data_asset = self._in_memory_data_assets.get(in_memory_asset_name)
-        if cached_data_asset:
-            asset.dataframe = cached_data_asset.dataframe
-        else:
-            # Asset is loaded into cache here (even without df) to enable loading of df at a later  # noqa: E501
-            # time when DataframeAsset.build_batch_request(dataframe=df) is called
-            self._in_memory_data_assets[in_memory_asset_name] = asset
+        self._in_memory_data_assets[in_memory_asset_name] = asset

Contributor (author): This code block is removed because we no longer store the dataframe on the asset.

[Codecov: added line datasource_dict.py#L132 not covered by tests]
return ds


12 changes: 9 additions & 3 deletions great_expectations/datasource/fluent/interfaces.py
@@ -466,7 +466,9 @@ def ensure_batch_metadata_is_not_none(cls, value: Any) -> Union[dict, Any]:
return {}
return value

-    def _get_batch_metadata_from_batch_request(self, batch_request: BatchRequest) -> BatchMetadata:
+    def _get_batch_metadata_from_batch_request(
+        self, batch_request: BatchRequest, ignore_options: Sequence = ()
+    ) -> BatchMetadata:
"""Performs config variable substitution and populates batch parameters for
Batch.metadata at runtime.
"""
@@ -477,7 +479,11 @@
batch_metadata = _ConfigurationSubstitutor().substitute_all_config_variables(
data=batch_metadata, replace_variables_dict=config_variables
)
-        batch_metadata.update(copy.deepcopy(batch_request.options))
+        batch_metadata.update(
+            copy.deepcopy(
+                {k: v for k, v in batch_request.options.items() if k not in ignore_options}
+            )
+        )

Contributor (author) billdirks, Aug 5, 2024: This was added since the dataframe is now always passed in options and we don't want to copy it into the metadata (Spark will actually die if we try). Previously, for runtime dataframes, we forced batch_request.options to be {} and added a special argument for the dataframe, which broke the Liskov substitution principle and arguably the interface segregation principle of SOLID.
return batch_metadata
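The effect of the `ignore_options` filter can be reproduced with a small standalone sketch (the function name and argument shapes here are assumed for illustration, not the real method):

```python
import copy


def merge_options_into_metadata(batch_metadata, options, ignore_options=()):
    # Mimics the updated metadata logic: copy the batch-request options into
    # the metadata, skipping ignored keys such as "dataframe", which should
    # never be deep-copied into metadata.
    merged = dict(batch_metadata)
    merged.update(
        copy.deepcopy({k: v for k, v in options.items() if k not in ignore_options})
    )
    return merged


meta = merge_options_into_metadata(
    {"owner": "data-team"},
    {"year": 2024, "dataframe": object()},
    ignore_options=("dataframe",),
)
assert meta == {"owner": "data-team", "year": 2024}
```

Note that the dict comprehension runs before `copy.deepcopy`, so an excluded value (like a Spark dataframe) is never copied at all.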

# Sorter methods
@@ -994,7 +1000,7 @@ def __init__(  # noqa: PLR0913
def _create_id(self) -> str:
options_list = []
for key, value in self.batch_request.options.items():
-            if key != "path":
+            if key not in ("path", "dataframe"):
Contributor (author):
We don't want the dataframe to be part of the Batch id. We should probably have this configurable at the asset/batch definition level since it seems strange for the top level batch to know specifics about different assets/batch definitions that produce them, but I am following the pattern here and we can refactor later.

options_list.append(f"{key}_{value}")
return "-".join([self.datasource.name, self.data_asset.name, *options_list])
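The id scheme can be reproduced in isolation (a sketch; the real method reads the names off the datasource and asset objects rather than taking them as arguments):

```python
def create_batch_id(datasource_name, asset_name, options, excluded=("path", "dataframe")):
    # Mirrors the _create_id logic above: join datasource, asset, and
    # stringified option pairs, skipping keys (like the dataframe itself)
    # that can't be meaningfully stringified into an id.
    options_list = [f"{key}_{value}" for key, value in options.items() if key not in excluded]
    return "-".join([datasource_name, asset_name, *options_list])


bid = create_batch_id("my_ds", "my_asset", {"year": 2024, "dataframe": object()})
assert bid == "my_ds-my_asset-year_2024"
```

Excluding the dataframe keeps the id stable and printable; the `repr` of a dataframe object would otherwise leak memory addresses or data into the id.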

125 changes: 63 additions & 62 deletions great_expectations/datasource/fluent/pandas_datasource.py
@@ -20,16 +20,13 @@
Set,
Tuple,
Type,
-    TypeVar,
Union,
)

import pandas as pd

import great_expectations.exceptions as gx_exceptions
from great_expectations._docs_decorators import (
-    deprecated_argument,
-    new_argument,
public_api,
)
from great_expectations.compatibility import pydantic, sqlalchemy
@@ -52,6 +49,7 @@
)
from great_expectations.datasource.fluent.signatures import _merge_signatures
from great_expectations.datasource.fluent.sources import DEFAULT_PANDAS_DATA_ASSET_NAME
+from great_expectations.exceptions.exceptions import BuildBatchRequestError

_EXCLUDE_TYPES_FROM_JSON: list[Type] = [sqlite3.Connection]

@@ -78,10 +76,6 @@
logger = logging.getLogger(__name__)


-# this enables us to include dataframe in the json schema
-_PandasDataFrameT = TypeVar("_PandasDataFrameT")


class PandasDatasourceError(Exception):
pass

@@ -119,7 +113,9 @@
def get_batch_parameters_keys(
self, partitioner: Optional[ColumnPartitioner] = None
) -> Tuple[str, ...]:
-        return tuple()
+        return tuple(
+            "dataframe",
+        )

[Codecov: added line pandas_datasource.py#L116 not covered by tests]

@override
def get_batch_list_from_batch_request(self, batch_request: BatchRequest) -> list[Batch]:
@@ -154,7 +150,7 @@
)

batch_metadata: BatchMetadata = self._get_batch_metadata_from_batch_request(
-            batch_request=batch_request
+            batch_request=batch_request, ignore_options=("dataframe",)
)

batch_list.append(
@@ -190,18 +186,21 @@
get_batch_list_from_batch_request method.
""" # noqa: E501
if options:
-            raise ValueError(  # noqa: TRY003
-                "options is not currently supported for this DataAssets and must be None or {}."
+            raise BuildBatchRequestError(
+                message="options is not currently supported for this DataAsset "
+                "and must be None or {}."
            )

[Codecov: added line pandas_datasource.py#L189 not covered by tests]

if batch_slice is not None:
-            raise ValueError(  # noqa: TRY003
-                "batch_slice is not currently supported and must be None for this DataAsset."
+            raise BuildBatchRequestError(
+                message="batch_slice is not currently supported for this DataAsset "
+                "and must be None."
            )

[Codecov: added line pandas_datasource.py#L195 not covered by tests]

if partitioner is not None:
-            raise ValueError(  # noqa: TRY003
-                "partitioner is not currently supported and must be None for this DataAsset."
+            raise BuildBatchRequestError(
+                message="partitioner is not currently supported for this DataAsset"
+                "and must be None."
            )

[Codecov: added line pandas_datasource.py#L201 not covered by tests]

return BatchRequest(
@@ -356,21 +355,13 @@
return str(uuid.uuid4()).replace("-", "")[:11]


-class DataFrameAsset(_PandasDataAsset, Generic[_PandasDataFrameT]):
+class DataFrameAsset(_PandasDataAsset):
Contributor (author): We remove the dataframe from the asset since it now needs to be passed in when making the batch definition. This lets us remove all the dataframe logic and the Generic from the asset.

# instance attributes
type: Literal["dataframe"] = "dataframe"
-    # TODO: <Alex>05/31/2023: Upon removal of deprecated "dataframe" argument to "PandasDatasource.add_dataframe_asset()", default can be deleted.</Alex>  # noqa: E501
-    dataframe: Optional[_PandasDataFrameT] = pydantic.Field(default=None, exclude=True, repr=False)

    class Config:
        extra = pydantic.Extra.forbid

-    @pydantic.validator("dataframe")
-    def _validate_dataframe(cls, dataframe: pd.DataFrame) -> pd.DataFrame:
-        if not isinstance(dataframe, pd.DataFrame):
-            raise ValueError("dataframe must be of type pandas.DataFrame")  # noqa: TRY003, TRY004
-        return dataframe

@override
def _get_reader_method(self) -> str:
raise NotImplementedError(
@@ -382,64 +373,82 @@
"""Pandas DataFrameAsset does not implement "_get_reader_options_include()" method, because DataFrame is already available.""" # noqa: E501
)

-    # TODO: <Alex>05/31/2023: Upon removal of deprecated "dataframe" argument to "PandasDatasource.add_dataframe_asset()", its validation code must be deleted.</Alex>  # noqa: E501
-    @new_argument(
-        argument_name="dataframe",
-        message='The "dataframe" argument is no longer part of "PandasDatasource.add_dataframe_asset()" method call; instead, "dataframe" is the required argument to "DataFrameAsset.build_batch_request()" method.',  # noqa: E501
-        version="0.16.15",
-    )
    @override
-    def build_batch_request(  # type: ignore[override]
+    def build_batch_request(
        self,
-        dataframe: Optional[pd.DataFrame] = None,
        options: Optional[BatchParameters] = None,
        batch_slice: Optional[BatchSlice] = None,
        partitioner: Optional[ColumnPartitioner] = None,
    ) -> BatchRequest:
"""A batch request that can be used to obtain batches for this DataAsset.

        Args:
-            dataframe: The Pandas Dataframe containing the data for this DataFrame data asset.
-            options: This is not currently supported and must be {}/None for this data asset.
+            options: This should have 1 key, 'dataframe', whose value is the dataframe to validate.
            batch_slice: This is not currently supported and must be None for this data asset.
partitioner: This is not currently supported and must be None for this data asset.

Returns:
A BatchRequest object that can be used to obtain a batch list from a Datasource by calling the
get_batch_list_from_batch_request method.
""" # noqa: E501
-        if options:
-            raise ValueError(  # noqa: TRY003
-                "options is not currently supported for this DataAssets and must be None or {}."
-            )
-
        if batch_slice is not None:
-            raise ValueError(  # noqa: TRY003
-                "batch_slice is not currently supported and must be None for this DataAsset."
+            raise BuildBatchRequestError(
+                message="batch_slice is not currently supported for this DataAsset "
+                "and must be None."
            )

        if partitioner is not None:
-            raise ValueError(  # noqa: TRY003
-                "partitioner is not currently supported and must be None for this DataAsset."
+            raise BuildBatchRequestError(
+                message="partitioner is not currently supported for this DataAsset"
+                "and must be None."
            )

-        if dataframe is None:
-            df = self.dataframe
-        else:
-            df = dataframe  # type: ignore[assignment]
+        if not (options is not None and "dataframe" in options and len(options) == 1):
+            raise BuildBatchRequestError(message="options must contain exactly 1 key, 'dataframe'.")

-        if df is None:
-            raise ValueError("Cannot build batch request for dataframe asset without a dataframe")  # noqa: TRY003
+        if not isinstance(options["dataframe"], pd.DataFrame):
+            raise BuildBatchRequestError(
+                message="Cannot build batch request for dataframe asset without a dataframe"
+            )

-        self.dataframe = df

[Codecov: added lines pandas_datasource.py#L395, #L401, #L410 not covered by tests]
return BatchRequest(
datasource_name=self.datasource.name,
data_asset_name=self.name,
options=options,
)

-        return super().build_batch_request()

    @override
    def _validate_batch_request(self, batch_request: BatchRequest) -> None:
        """Validates the batch_request has the correct form.

        Args:
            batch_request: A batch request object to be validated.
        """
if not (
batch_request.datasource_name == self.datasource.name
and batch_request.data_asset_name == self.name
and batch_request.options
and len(batch_request.options) == 1
and "dataframe" in batch_request.options
and isinstance(batch_request.options["dataframe"], pd.DataFrame)
):
expect_batch_request_form = BatchRequest[None](

datasource_name=self.datasource.name,
data_asset_name=self.name,
options={"dataframe": pd.DataFrame()},
batch_slice=batch_request._batch_slice_input,
)
raise gx_exceptions.InvalidBatchRequestError( # noqa: TRY003

"BatchRequest should have form:\n"
f"{pf(expect_batch_request_form.dict())}\n"
f"but actually has form:\n{pf(batch_request.dict())}\n"
)
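The validation rules amount to a structural check on the request. A self-contained approximation, using plain dicts instead of the real BatchRequest and pandas types (names and shapes here are illustrative only):

```python
def is_valid_dataframe_batch_request(request, datasource_name, asset_name, df_type=dict):
    # Approximates DataFrameAsset._validate_batch_request: the request must
    # target this datasource/asset and carry exactly one option, "dataframe",
    # of the expected type (pd.DataFrame in the real code; df_type here).
    options = request.get("options") or {}
    return (
        request.get("datasource_name") == datasource_name
        and request.get("data_asset_name") == asset_name
        and set(options) == {"dataframe"}
        and isinstance(options["dataframe"], df_type)
    )


req = {"datasource_name": "ds", "data_asset_name": "asset", "options": {"dataframe": {}}}
assert is_valid_dataframe_batch_request(req, "ds", "asset")
```

Because `and` short-circuits, the `isinstance` check only runs once the single-key shape is confirmed, so a malformed request never raises a KeyError.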

@override
def get_batch_list_from_batch_request(self, batch_request: BatchRequest) -> list[Batch]:
self._validate_batch_request(batch_request)

-        batch_spec = RuntimeDataBatchSpec(batch_data=self.dataframe)
+        batch_spec = RuntimeDataBatchSpec(batch_data=batch_request.options["dataframe"])
execution_engine: PandasExecutionEngine = self.datasource.get_execution_engine()
data, markers = execution_engine.get_batch_data_and_markers(batch_spec=batch_spec)

@@ -459,7 +468,7 @@
)

batch_metadata: BatchMetadata = self._get_batch_metadata_from_batch_request(
-            batch_request=batch_request
+            batch_request=batch_request, ignore_options=("dataframe",)
)

return [
@@ -651,29 +660,22 @@
'Cannot execute "PandasDatasource.read_dataframe()" without a valid "dataframe" argument.' # noqa: E501
)

-            batch_request = asset.build_batch_request(dataframe=dataframe)
+            batch_request = asset.build_batch_request(options={"dataframe": dataframe})
else:
batch_request = asset.build_batch_request()

return asset.get_batch_list_from_batch_request(batch_request)[-1]

@public_api
-    @deprecated_argument(
-        argument_name="dataframe",
-        message='The "dataframe" argument is no longer part of "PandasDatasource.add_dataframe_asset()" method call; instead, "dataframe" is the required argument to "DataFrameAsset.build_batch_request()" method.',  # noqa: E501
-        version="0.16.15",
-    )
def add_dataframe_asset(
self,
name: str,
-        dataframe: Optional[pd.DataFrame] = None,
batch_metadata: Optional[BatchMetadata] = None,
) -> DataFrameAsset:
"""Adds a Dataframe DataAsset to this PandasDatasource object.

Args:
name: The name of the Dataframe asset. This can be any arbitrary string.
-            dataframe: The Pandas Dataframe containing the data for this DataFrame data asset.
batch_metadata: An arbitrary user defined dictionary with string keys which will get inherited by any
batches created from the asset.

@@ -684,7 +686,6 @@
name=name,
batch_metadata=batch_metadata or {},
)
-        asset.dataframe = dataframe
return self._add_asset(asset=asset)

@public_api
15 changes: 5 additions & 10 deletions great_expectations/datasource/fluent/pandas_datasource.pyi
@@ -27,7 +27,6 @@ from typing_extensions import TypeAlias

from great_expectations._docs_decorators import (
deprecated_argument,
-    new_argument,
)
from great_expectations._docs_decorators import (
public_api as public_api,
@@ -60,7 +59,6 @@ _EXCLUDE_TYPES_FROM_JSON: list[Type]
MappingIntStrAny: TypeAlias = Mapping[Union[int, str], Any]
AbstractSetIntStr: TypeAlias = AbstractSet[Union[int, str]]
logger: Logger
-_PandasDataFrameT = TypeVar("_PandasDataFrameT")

class PandasDatasourceError(Exception): ...

@@ -122,16 +120,13 @@ class XMLAsset(_PandasDataAsset): ...

class DataFrameAsset(_PandasDataAsset):
type: Literal["dataframe"]
-    dataframe: _PandasDataFrameT  # type: ignore[valid-type]

-    @new_argument(
-        argument_name="dataframe",
-        message='The "dataframe" argument is no longer part of "PandasDatasource.add_dataframe_asset()" method call; instead, "dataframe" is the required argument to "DataFrameAsset.build_batch_request()" method.',  # noqa: E501
-        version="0.16.15",
-    )
    @override
-    def build_batch_request(  # type: ignore[override]
-        self, dataframe: Optional[pd.DataFrame] = None
+    def build_batch_request(
+        self,
+        options: Optional[BatchParameters] = ...,
+        batch_slice: Optional[BatchSlice] = ...,
+        partitioner: Optional[ColumnPartitioner] = ...,
    ) -> BatchRequest: ...
@override
def get_batch_list_from_batch_request(self, batch_request: BatchRequest) -> list[Batch]: ...
@@ -38,9 +38,6 @@
"items": {
"$ref": "#/definitions/BatchDefinition"
}
-      },
-      "dataframe": {
-        "title": "Dataframe"
}
},
"required": [
@@ -548,9 +548,6 @@
"items": {
"$ref": "#/definitions/BatchDefinition"
}
-      },
-      "dataframe": {
-        "title": "Dataframe"
}
},
"required": [
@@ -38,9 +38,6 @@
"items": {
"$ref": "#/definitions/BatchDefinition"
}
-      },
-      "dataframe": {
-        "title": "Dataframe"
}
},
"required": [