Merge branch 'develop' into m/_/simplify_data_context_id_logic
cdkini authored Sep 20, 2024
2 parents 432423b + 1e13ef6 commit b3c9c56
Showing 34 changed files with 528 additions and 79 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -433,6 +433,7 @@ jobs:
- postgresql
- snowflake
- spark
- spark_connect
- trino
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
exclude:
7 changes: 7 additions & 0 deletions assets/docker/spark/docker-compose.yml
@@ -4,3 +4,10 @@ services:
ports:
- "9090:8080"
- "7077:7077"

spark-connect:
image: ${ECR_PULL_THROUGH_REPOSITORY_URL}bitnami/spark:3.5.2
ports:
- "15002:15002"
# See https://spark.apache.org/docs/latest/spark-connect-overview.html#download-and-start-spark-server-with-spark-connect
command: ./sbin/start-connect-server.sh --packages org.apache.spark:spark-connect_2.12:3.5.2
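
For local testing, a PySpark client can reach this service through the Spark Connect remote URL. The following is a minimal sketch, assuming PySpark 3.5 with the `connect` extra installed and the compose service running on localhost:

```python
from pyspark.sql import SparkSession

# Connect to the Spark Connect server exposed on port 15002 by the service above.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
print(df.count())  # 2
```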
42 changes: 42 additions & 0 deletions docs/docusaurus/docs/cloud/connect/connect_databrickssql.md
@@ -0,0 +1,42 @@
---
sidebar_label: 'Connect GX Cloud to Databricks SQL'
title: 'Connect GX Cloud to Databricks SQL'
description: Connect GX Cloud to a Databricks SQL Data Source.
---

import TabItem from '@theme/TabItem';
import Tabs from '@theme/Tabs';

## Prerequisites

- You have a [GX Cloud account](https://greatexpectations.io/cloud) with [Admin or Editor permissions](../about_gx.md#roles-and-responsibilities).

- You have a Databricks SQL catalog, schema, and table.

- To improve data security, GX recommends creating a separate Databricks SQL [service principal](https://docs.databricks.com/en/admin/users-groups/service-principals.html#manage-service-principals-in-your-account) for your GX Cloud connection.


## Connect to a Databricks SQL Data Asset

1. In GX Cloud, click **Data Assets** > **New Data Asset** > **Databricks SQL**.

2. Enter a meaningful name for the Data Source in the **Data Source name** field.

3. Enter a connection string in the **Connection string** field. The connection string format is `databricks://token:{token}@{host}?http_path={http_path}&catalog={catalog}&schema={schema}`. A filled-in example appears after these steps.
    - For instructions on creating a GX-specific user in your Databricks SQL catalog, click **See instructions**.

4. Click **Connect**.

5. Select tables to import as Data Assets:

- Check the box next to a table name to add that table as an asset.

- At least one table must be added.

    - To search for a specific table, type the table's name in the **Search** box above the list of tables.

    - To add all of the available tables, check the **All Tables** box.

6. Click **Add Asset**.

7. Create an Expectation. See [Create an Expectation](/cloud/expectations/manage_expectations.md#create-an-expectation).
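
For reference, a fully populated connection string might look like the sketch below; the token, host, HTTP path, catalog, and schema are hypothetical placeholders, not working credentials:

```python
# Hypothetical values for illustration only; paste the resulting string into the
# Connection string field in GX Cloud.
connection_string = (
    "databricks://token:dapiXXXXXXXXXXXXXXXXXXXX"
    "@dbc-a1b2c3d4-e5f6.cloud.databricks.com"
    "?http_path=/sql/1.0/warehouses/abc123def456"
    "&catalog=sales&schema=public"
)
```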
1 change: 1 addition & 0 deletions docs/docusaurus/docs/cloud/connect/connect_lp.md
@@ -17,6 +17,7 @@ import OverviewCard from '@site/src/components/OverviewCard';
<LinkCardGrid>
<LinkCard topIcon label="Connect GX Cloud to PostgreSQL" description="Quickly start using GX Cloud with PostgreSQL." to="/cloud/connect/connect_postgresql" icon="/img/postgresql_icon.svg" />
<LinkCard topIcon label="Connect GX Cloud to Snowflake" description="Quickly start using GX Cloud with Snowflake." to="/cloud/connect/connect_snowflake" icon="/img/snowflake_icon.png" />
<LinkCard topIcon label="Connect GX Cloud to Databricks SQL" description="Quickly start using GX Cloud with Databricks SQL." to="/cloud/connect/connect_databrickssql" icon="/img/databricks_icon.svg" />
<LinkCard topIcon label="Connect GX Cloud and Airflow" description="Use Airflow to run scheduled GX Cloud validations." to="/cloud/connect/connect_airflow" icon="/img/airflow_icon.png" />
<LinkCard topIcon label="Connect to GX Cloud with Python" description="Quickly start using GX Cloud with Python." to="/cloud/connect/connect_python" icon="/img/python_icon.svg" />
</LinkCardGrid>
20 changes: 13 additions & 7 deletions docs/docusaurus/docs/cloud/expectations/manage_expectations.md
@@ -8,12 +8,6 @@ An Expectation is a verifiable assertion about your data. They make implicit ass

<!-- [//]: # (TODO: To learn more about Expectations, see Expectation.) -->

:::info Custom SQL Query Expectations

To create custom SQL query Expectations, you'll need to use the GX API. See [Customize Expectations](/core/customize_expectations/customize_expectations.md).

:::

## Prerequisites

- You have a [Data Asset](/cloud/data_assets/manage_data_assets.md#create-a-data-asset).
@@ -23,7 +17,7 @@ To create custom SQL query Expectations, you'll need to use the GX API. See [Cus
The following table lists the available GX Cloud Expectations.

| Data Quality Issue | Expectation | Description |
| ------------------ | --------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
|--------------------|-----------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------|
| Cardinality | `expect_column_values_to_be_unique` | Expect each column value to be unique. |
| Cardinality | `expect_compound_columns_to_be_unique` | Expect the compound columns to be unique. |
| Cardinality | `expect_select_column_values_to_be_unique_within_record` | Expect the values for each record to be unique across the columns listed. Note that records can be duplicated. |
@@ -69,6 +63,18 @@ The following table lists the available GX Cloud Expectations.
| Volume | `expect_table_row_count_to_equal` | Expect the number of rows to equal a value. |
| Volume | `expect_table_row_count_to_equal_other_table` | Expect the number of rows to equal the number in another table within the same database. |

## Custom SQL Expectations

GX Cloud also offers the ability to write a custom Expectation using SQL. A custom SQL Expectation fails validation if the provided SQL query returns one or more rows.

The query should be written in the SQL dialect of the Data Source that contains the Data Asset.

:::info Optional `{batch}` named query

The optional `{batch}` named query references the Batch of data under test. When the Expectation is evaluated, the `{batch}` named query will be replaced with the Batch of data that is validated.

:::
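
As an illustration, the following sketch shows the kind of query a custom SQL Expectation might use, assuming a hypothetical `passenger_count` column; the SQL between the quotes is what you would enter in GX Cloud:

```python
# Hypothetical example: the Expectation fails whenever a row has an out-of-range passenger count.
# The `{batch}` named query is replaced with the Batch of data under test at evaluation time.
custom_sql = """
SELECT *
FROM {batch}
WHERE passenger_count > 6 OR passenger_count < 0
"""
```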

## Add an Expectation

1. In GX Cloud, click **Data Assets**.
2 changes: 1 addition & 1 deletion docs/docusaurus/docs/components/_data.jsx
@@ -1,5 +1,5 @@
export default {
release_version: 'great_expectations, version 1.0.3',
release_version: 'great_expectations, version 1.0.5',
min_python: '3.8',
max_python: '3.11'
}
@@ -11,7 +11,7 @@ import PrereqPreconfiguredDataSourceAndAsset from '../_core_components/prerequis

Among the available Expectations, the `UnexpectedRowsExpectation` is designed to facilitate the execution of SQL or Spark-SQL queries as the core logic for an Expectation. By default, `UnexpectedRowsExpectation` considers validation successful when no rows are returned by the provided SQL query.

You customize an `UnexpectedRowsExpectation` in essentially the same manner as you would [define a custom Expectation](/core/customize_expectations/define_a_custom_expectation_class.md), by subclassing `UnexpectedRowsExpectation` and providing customized default attributes and text for Data Docs. However, there are some caveats around the `UnexpectedRowsExpectation`'s `unexpected_rows_query` attribute that deserve further detail.
Like any other Expectation, you can instantiate the `UnexpectedRowsExpectation` directly. You can also customize an `UnexpectedRowsExpectation` in essentially the same manner as you would [define a custom Expectation](/core/customize_expectations/define_a_custom_expectation_class.md), by subclassing `UnexpectedRowsExpectation` and providing customized default attributes and text for Data Docs. However, there are some caveats around the `UnexpectedRowsExpectation`'s `unexpected_rows_query` attribute that deserve further detail.

<!-- TODO: Do we want to discuss custom `_validate(...)` logic here, or should that be held for a future topic on building custom Expectation classes from scratch? -->

@@ -48,7 +48,7 @@ You customize an `UnexpectedRowsExpectation` in essentially the same manner as y

The `unexpected_rows_query` attribute is a SQL or Spark-SQL query that returns a selection of rows from the Batch of data being validated. By default, rows that are returned have failed the validation check.

Although the `unexpected_rows_query` should be written in standard SQL or Spark-SQL syntax, it must also contain the special `{batch}` placeholder. When the Expectation is evaluated, the `{batch}` placeholder will be replaced with the Batch of data that is validated.
The `unexpected_rows_query` should be written in standard SQL or Spark-SQL syntax, except that it can also contain the special `{batch}` named query. When the Expectation is evaluated, the `{batch}` keyword will be replaced with the Batch of data that is configured for your Data Asset.

In this example, `unexpected_rows_query` will select any rows where the passenger count is greater than `6` or less than `0`. These rows will fail validation for this Expectation:
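
The following is a minimal sketch of such a subclass, assuming the GX 1.0 `great_expectations.expectations` import alias and a hypothetical `passenger_count` column:

```python
import great_expectations.expectations as gxe


class ExpectPassengerCountToBeLegal(gxe.UnexpectedRowsExpectation):
    # Rows returned by this query are treated as unexpected and fail validation.
    unexpected_rows_query: str = (
        "SELECT * FROM {batch} WHERE passenger_count > 6 OR passenger_count < 0"
    )
    description: str = "There should be no rides with more than 6 or fewer than 0 passengers."
```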

18 changes: 18 additions & 0 deletions docs/docusaurus/docs/oss/changelog.md
@@ -14,6 +14,24 @@ When we deprecate our public APIs, we will

Before we completely remove the functionality in a new major release, there will be at least one minor release that contains the deprecation so that you can smoothly transition to the new API.

### 1.0.5
* [BUGFIX] Using `{batch}` keyword in `UnexpectedRowsQuery` ([#10392](https://github.com/great-expectations/great_expectations/pull/10392))
* [BUGFIX] Fix Databricks SQL Regex and Like based Expectations ([#10406](https://github.com/great-expectations/great_expectations/pull/10406))
* [BUGFIX] Support Spark connect dataframes ([#10420](https://github.com/great-expectations/great_expectations/pull/10420))
* [BUGFIX] Handle DatabricksSQL attribute error and update dependency ([#10424](https://github.com/great-expectations/great_expectations/pull/10424))
* [DOCS] Add Connect to Databricks SQL page in GX Cloud ([#10394](https://github.com/great-expectations/great_expectations/pull/10394)) (thanks @allisongx)
* [DOCS] Changelog updates `0.18.18` -> `0.18.21` ([#10422](https://github.com/great-expectations/great_expectations/pull/10422))
* [DOCS] Add Connect to Databricks SQL to GX Cloud docs TOC ([#10423](https://github.com/great-expectations/great_expectations/pull/10423))
* [MAINTENANCE] Fix `SqlAlchemyExecutionEngine.get_connection()` typing + update column identifier tests ([#10399](https://github.com/great-expectations/great_expectations/pull/10399))
* [MAINTENANCE] Move FabricPowerBIDatasource out of experimental dir ([#10419](https://github.com/great-expectations/great_expectations/pull/10419))

### 1.0.4
* [BUGFIX] Fix action equality ([#10393](https://github.com/great-expectations/great_expectations/pull/10393))
* [BUGFIX] Patch additional issues with data docs page retrieval in checkpoint actions ([#10400](https://github.com/great-expectations/great_expectations/pull/10400))
* [DOCS] Add CTAs to request a demo ([#10389](https://github.com/great-expectations/great_expectations/pull/10389))
* [MAINTENANCE] Ensure that all nested validation definition diagnostics are emitted from a parent checkpoint ([#10386](https://github.com/great-expectations/great_expectations/pull/10386))
* [MAINTENANCE] Fix `SqlAlchemyExecutionEngine.get_connection()` typing + update column identifier tests ([#10399](https://github.com/great-expectations/great_expectations/pull/10399))

### 1.0.3
* [FEATURE] Replace get_batch_list_from_batch_request with get_batch and get_batch_identifiers_list ([#10295](https://github.com/great-expectations/great_expectations/pull/10295))
* [FEATURE] Add Checkpoint.run analytics ([#10382](https://github.com/great-expectations/great_expectations/pull/10382))
10 changes: 8 additions & 2 deletions docs/docusaurus/docusaurus.config.js
@@ -298,10 +298,10 @@ module.exports = {
lastVersion: 'current',
versions: {
current: {
label: '1.0.3',
label: '1.0.5',
},
['0.18']: {
label: '0.18.17',
label: '0.18.21',
},
},
admonitions: {
@@ -325,6 +325,12 @@ module.exports = {
// Optional fields.
anonymizeIP: true, // Should IPs be anonymized?
},
sitemap: {
ignorePatterns: [
'**/0.18/oss/templates/**',
'**/0.18/oss/team_templates/**'
],
}
},
],
],
6 changes: 6 additions & 0 deletions docs/docusaurus/sidebars.js
@@ -190,6 +190,7 @@ module.exports = {
items: [
'cloud/connect/connect_postgresql',
'cloud/connect/connect_snowflake',
'cloud/connect/connect_databrickssql',
'cloud/connect/connect_airflow',
'cloud/connect/connect_python',
]
@@ -251,6 +252,11 @@ module.exports = {
label: 'Available Expectations',
href: '/docs/cloud/expectations/manage_expectations#available-expectations',
},
{
type: 'link',
label: 'Custom SQL Expectations',
href: '/docs/cloud/expectations/manage_expectations#custom-sql-expectations',
},
{
type: 'link',
label: 'Add an Expectation',
25 changes: 25 additions & 0 deletions docs/docusaurus/versioned_docs/version-0.18/oss/changelog.md
@@ -10,6 +10,31 @@ title: Changelog
- Deprecation warnings are accompanied by a moniker (as a code comment) indicating when they were deprecated. For example: `# deprecated-v0.13`
- Changes to methods and parameters due to deprecation are also noted in the relevant docstrings.

### 0.18.21
* [BUGFIX] Using `{batch}` keyword in `UnexpectedRowsQuery` (#10392) ([#10411](https://github.com/great-expectations/great_expectations/pull/10411))
* [BUGFIX] 0.18.x Ignore unsupported INTERVAL type as part of CDM ([#10414](https://github.com/great-expectations/great_expectations/pull/10414))
* [BUGFIX] 0.18.x Databricks SQL Pattern Expectation Fix ([#10415](https://github.com/great-expectations/great_expectations/pull/10415))
* [MAINTENANCE] Pass `description` from `Validator` to `ExpectationConfiguration` ([#10388](https://github.com/great-expectations/great_expectations/pull/10388))

### 0.18.20
* [FEATURE] Add `UnexpectedRowsExpectation` ([#10342](https://github.com/great-expectations/great_expectations/pull/10342))
* [BUGFIX] Remove illegible duplicate local Data Docs link from Slack renderer ([#10129](https://github.com/great-expectations/great_expectations/pull/10129))
* [MAINTENANCE] Ruff 0.5.3 + PR annotations ([#10128](https://github.com/great-expectations/great_expectations/pull/10128))
* [MAINTENANCE] Fix 0.18.x CI ([#10199](https://github.com/great-expectations/great_expectations/pull/10199))
* [MAINTENANCE] Update column identifier tests ([#8783](https://github.com/great-expectations/great_expectations/pull/8783))

### 0.18.19
* [FEATURE] Snowflake test for the presence of a schema in `test_connection()` ([#10100](https://github.com/great-expectations/great_expectations/pull/10100))
* [BUGFIX] Z-score renderer when `double_sided` ([#10085](https://github.com/great-expectations/great_expectations/pull/10085))
* [BUGFIX] SQLDatasource - lowercase unquoted `schema_names` for SQLAlchemy case-sensitivity compatibility ([#10107](https://github.com/great-expectations/great_expectations/pull/10107))
* [MAINTENANCE] Export `great_expectations.compatibility` types ([#10089](https://github.com/great-expectations/great_expectations/pull/10089))
* [MAINTENANCE] 0.18.x - mypy - `possibly-undefined` ([#10091](https://github.com/great-expectations/great_expectations/pull/10091))
* [MAINTENANCE] loosen ruamel pin ([#10081](https://github.com/great-expectations/great_expectations/pull/10081))

### 0.18.18
* [FEATURE] Add atomic renderer for `ExpectMulticolumnSumToEqual` (#10076) ([#10077](https://github.com/great-expectations/great_expectations/pull/10077))
* [FEATURE] Snowflake - narrow Account Identifier regex ([#10069](https://github.com/great-expectations/great_expectations/pull/10069))
* [FEATURE] Add missing atomic renderers to Expectations (#10079) ([#10080](https://github.com/great-expectations/great_expectations/pull/10080))

### 0.18.17
* [FEATURE] Snowflake - Better Account Identifier related TestConnectionErrors ([#10043](https://github.com/great-expectations/great_expectations/pull/10043))
2 changes: 1 addition & 1 deletion great_expectations/compatibility/databricks.py
@@ -5,6 +5,6 @@
)

try:
from databricks import connect # type: ignore[import-untyped]
from databricks import connect
except ImportError:
connect = DATABRICKS_CONNECT_NOT_IMPORTED
5 changes: 5 additions & 0 deletions great_expectations/compatibility/pyspark.py
@@ -39,6 +39,11 @@
except (ImportError, AttributeError):
Column = SPARK_NOT_IMPORTED # type: ignore[assignment,misc]

try:
from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame
except (ImportError, AttributeError):
ConnectDataFrame = SPARK_NOT_IMPORTED # type: ignore[assignment,misc]

try:
from pyspark.sql import DataFrame
except (ImportError, AttributeError):
2 changes: 1 addition & 1 deletion great_expectations/datasource/fluent/__init__.py
@@ -50,7 +50,7 @@
from great_expectations.datasource.fluent.pandas_azure_blob_storage_datasource import (
PandasAzureBlobStorageDatasource,
)
from great_expectations.experimental.datasource.fabric import FabricPowerBIDatasource
from great_expectations.datasource.fluent.fabric import FabricPowerBIDatasource
from great_expectations.datasource.fluent.postgres_datasource import (
PostgresDatasource,
)
19 changes: 14 additions & 5 deletions great_expectations/datasource/fluent/spark_datasource.py
@@ -5,6 +5,7 @@
from pprint import pformat as pf
from typing import (
TYPE_CHECKING,
Any,
ClassVar,
Dict,
Generic,
@@ -27,7 +28,7 @@
StrictInt,
StrictStr,
)
from great_expectations.compatibility.pyspark import DataFrame, pyspark
from great_expectations.compatibility.pyspark import ConnectDataFrame, DataFrame, pyspark
from great_expectations.compatibility.typing_extensions import override
from great_expectations.core import IDDict
from great_expectations.core.batch import LegacyBatchDefinition
@@ -47,7 +48,7 @@
from great_expectations.exceptions.exceptions import BuildBatchRequestError

if TYPE_CHECKING:
from typing_extensions import TypeAlias
from typing_extensions import TypeAlias, TypeGuard

from great_expectations.compatibility.pyspark import SparkSession
from great_expectations.core.batch_definition import BatchDefinition
@@ -231,9 +232,9 @@ def build_batch_request(
if not (options is not None and "dataframe" in options and len(options) == 1):
raise BuildBatchRequestError(message="options must contain exactly 1 key, 'dataframe'.")

if not isinstance(options["dataframe"], DataFrame):
if not self.is_spark_data_frame(options["dataframe"]):
raise BuildBatchRequestError(
message="Can not build batch request for dataframe asset " "without a dataframe."
message="Cannot build batch request without a Spark DataFrame."
)

return BatchRequest(
@@ -255,7 +256,7 @@ def _validate_batch_request(self, batch_request: BatchRequest) -> None:
and batch_request.options
and len(batch_request.options) == 1
and "dataframe" in batch_request.options
and isinstance(batch_request.options["dataframe"], DataFrame)
and self.is_spark_data_frame(batch_request.options["dataframe"])
):
expect_batch_request_form = BatchRequest[None](
datasource_name=self.datasource.name,
@@ -314,6 +315,14 @@ def add_batch_definition_whole_dataframe(self, name: str) -> BatchDefinition:
partitioner=None,
)

@staticmethod
def is_spark_data_frame(df: Any) -> TypeGuard[Union[DataFrame, ConnectDataFrame]]:
"""Check that a given object is a Spark DataFrame.
This could either be a regular Spark DataFrame or a Spark Connect DataFrame.
"""
data_frame_types = [DataFrame, ConnectDataFrame]
return any((cls and isinstance(df, cls)) for cls in data_frame_types)


@public_api
class SparkDatasource(_SparkDatasource):
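
Taken together, these changes let a Spark Connect DataFrame be passed anywhere a regular Spark DataFrame is accepted. A minimal usage sketch, assuming GX 1.0's fluent API and a local Spark Connect server such as the docker-compose service above:

```python
import great_expectations as gx
from pyspark.sql import SparkSession

# Obtain a Spark Connect session and a Connect DataFrame.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
df = spark.createDataFrame([(1, 2), (2, 9)], ["ride_id", "passenger_count"])

context = gx.get_context()
data_source = context.data_sources.add_spark(name="spark_connect_source")
asset = data_source.add_dataframe_asset(name="rides")
batch_definition = asset.add_batch_definition_whole_dataframe("whole dataframe")

# The Connect DataFrame passes the is_spark_data_frame check introduced above.
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})
```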
2 changes: 1 addition & 1 deletion great_expectations/deployment_version
@@ -1 +1 @@
1.0.3
1.0.5