Merge branch 'develop' into m/_/simplify_data_context_id_logic
cdkini authored Sep 20, 2024
2 parents 432423b + 1e13ef6 commit b3c9c56
Showing 34 changed files with 528 additions and 79 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -433,6 +433,7 @@ jobs:
- postgresql
- snowflake
- spark
- spark_connect
- trino
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
exclude:
7 changes: 7 additions & 0 deletions assets/docker/spark/docker-compose.yml
@@ -4,3 +4,10 @@ services:
ports:
- "9090:8080"
- "7077:7077"

spark-connect:
image: ${ECR_PULL_THROUGH_REPOSITORY_URL}bitnami/spark:3.5.2
ports:
- "15002:15002"
# See https://spark.apache.org/docs/latest/spark-connect-overview.html#download-and-start-spark-server-with-spark-connect
command: ./sbin/start-connect-server.sh --packages org.apache.spark:spark-connect_2.12:3.5.2
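
For local testing, a PySpark client can reach this service through the Spark Connect remote URL. The following is a minimal sketch, assuming PySpark 3.5 with the `connect` extra installed and the compose service running on localhost:

```python
from pyspark.sql import SparkSession

# Connect to the Spark Connect server exposed on port 15002 by the service above.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
print(df.count())  # 2
```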
42 changes: 42 additions & 0 deletions docs/docusaurus/docs/cloud/connect/connect_databrickssql.md
@@ -0,0 +1,42 @@
---
sidebar_label: 'Connect GX Cloud to Databricks SQL'
title: 'Connect GX Cloud to Databricks SQL'
description: Connect GX Cloud to a Databricks SQL Data Source.
---

import TabItem from '@theme/TabItem';
import Tabs from '@theme/Tabs';

## Prerequisites

- You have a [GX Cloud account](https://greatexpectations.io/cloud) with [Admin or Editor permissions](../about_gx.md#roles-and-responsibilities).

- You have a Databricks SQL catalog, schema, and table.

- To improve data security, GX recommends creating a separate Databricks SQL [service principal](https://docs.databricks.com/en/admin/users-groups/service-principals.html#manage-service-principals-in-your-account) for your GX Cloud connection.


## Connect to a Databricks SQL Data Asset

1. In GX Cloud, click **Data Assets** > **New Data Asset** > **Databricks SQL**.

2. Enter a meaningful name for the Data Source in the **Data Source name** field.

3. Enter a connection string in the **Connection string** field. The connection string format is `databricks://token:{token}@{host}?http_path={http_path}&catalog={catalog}&schema={schema}`. A filled-in example appears after these steps.
    - For instructions on creating a GX-specific user in your Databricks SQL catalog, click **See instructions**.

4. Click **Connect**.

5. Select tables to import as Data Assets:

- Check the box next to a table name to add that table as an asset.

- At least one table must be added.

    - To search for a specific table, type the table's name in the **Search** box above the list of tables.

    - To add all of the available tables, check the **All Tables** box.

6. Click **Add Asset**.

7. Create an Expectation. See [Create an Expectation](/cloud/expectations/manage_expectations.md#create-an-expectation).
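
For reference, a fully populated connection string might look like the sketch below; the token, host, HTTP path, catalog, and schema are hypothetical placeholders, not working credentials:

```python
# Hypothetical values for illustration only; paste the resulting string into the
# Connection string field in GX Cloud.
connection_string = (
    "databricks://token:dapiXXXXXXXXXXXXXXXXXXXX"
    "@dbc-a1b2c3d4-e5f6.cloud.databricks.com"
    "?http_path=/sql/1.0/warehouses/abc123def456"
    "&catalog=sales&schema=public"
)
```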
1 change: 1 addition & 0 deletions docs/docusaurus/docs/cloud/connect/connect_lp.md
@@ -17,6 +17,7 @@ import OverviewCard from '@site/src/components/OverviewCard';
<LinkCardGrid>
<LinkCard topIcon label="Connect GX Cloud to PostgreSQL" description="Quickly start using GX Cloud with PostgreSQL." to="/cloud/connect/connect_postgresql" icon="/img/postgresql_icon.svg" />
<LinkCard topIcon label="Connect GX Cloud to Snowflake" description="Quickly start using GX Cloud with Snowflake." to="/cloud/connect/connect_snowflake" icon="/img/snowflake_icon.png" />
<LinkCard topIcon label="Connect GX Cloud to Databricks SQL" description="Quickly start using GX Cloud with Databricks SQL." to="/cloud/connect/connect_databrickssql" icon="/img/databricks_icon.svg" />
<LinkCard topIcon label="Connect GX Cloud and Airflow" description="Use Airflow to run scheduled GX Cloud validations." to="/cloud/connect/connect_airflow" icon="/img/airflow_icon.png" />
<LinkCard topIcon label="Connect to GX Cloud with Python" description="Quickly start using GX Cloud with Python." to="/cloud/connect/connect_python" icon="/img/python_icon.svg" />
</LinkCardGrid>
20 changes: 13 additions & 7 deletions docs/docusaurus/docs/cloud/expectations/manage_expectations.md
@@ -8,12 +8,6 @@ An Expectation is a verifiable assertion about your data. They make implicit ass

<!-- [//]: # (TODO: To learn more about Expectations, see Expectation.) -->

:::info Custom SQL Query Expectations

To create custom SQL query Expectations, you'll need to use the GX API. See [Customize Expectations](/core/customize_expectations/customize_expectations.md).

:::

## Prerequisites

- You have a [Data Asset](/cloud/data_assets/manage_data_assets.md#create-a-data-asset).
@@ -23,7 +17,7 @@ To create custom SQL query Expectations, you'll need to use the GX API. See [Cus
The following table lists the available GX Cloud Expectations.

| Data Quality Issue | Expectation | Description |
| ------------------ | --------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
|--------------------|-----------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------|
| Cardinality | `expect_column_values_to_be_unique` | Expect each column value to be unique. |
| Cardinality | `expect_compound_columns_to_be_unique` | Expect the compound columns to be unique. |
| Cardinality | `expect_select_column_values_to_be_unique_within_record` | Expect the values for each record to be unique across the columns listed. Note that records can be duplicated. |
@@ -69,6 +63,18 @@ The following table lists the available GX Cloud Expectations.
| Volume | `expect_table_row_count_to_equal` | Expect the number of rows to equal a value. |
| Volume | `expect_table_row_count_to_equal_other_table` | Expect the number of rows to equal the number in another table within the same database. |

## Custom SQL Expectations

GX Cloud also offers the ability to write a custom Expectation using SQL. A custom SQL Expectation fails validation if the provided SQL query returns one or more rows.

The query should be written in the SQL dialect of the Data Source that contains the Data Asset.

:::info Optional `{batch}` named query

The optional `{batch}` named query references the Batch of data under test. When the Expectation is evaluated, the `{batch}` named query will be replaced with the Batch of data that is validated.

:::
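
As an illustration, the following sketch shows the kind of query a custom SQL Expectation might use, assuming a hypothetical `passenger_count` column; the SQL between the quotes is what you would enter in GX Cloud:

```python
# Hypothetical example: the Expectation fails whenever a row has an out-of-range passenger count.
# The `{batch}` named query is replaced with the Batch of data under test at evaluation time.
custom_sql = """
SELECT *
FROM {batch}
WHERE passenger_count > 6 OR passenger_count < 0
"""
```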

## Add an Expectation

1. In GX Cloud, click **Data Assets**.
2 changes: 1 addition & 1 deletion docs/docusaurus/docs/components/_data.jsx
@@ -1,5 +1,5 @@
export default {
release_version: 'great_expectations, version 1.0.3',
release_version: 'great_expectations, version 1.0.5',
min_python: '3.8',
max_python: '3.11'
}
@@ -11,7 +11,7 @@ import PrereqPreconfiguredDataSourceAndAsset from '../_core_components/prerequis

Among the available Expectations, the `UnexpectedRowsExpectation` is designed to facilitate the execution of SQL or Spark-SQL queries as the core logic for an Expectation. By default, `UnexpectedRowsExpectation` considers validation successful when no rows are returned by the provided SQL query.

You customize an `UnexpectedRowsExpectation` in essentially the same manner as you would [define a custom Expectation](/core/customize_expectations/define_a_custom_expectation_class.md), by subclassing `UnexpectedRowsExpectation` and providing customized default attributes and text for Data Docs. However, there are some caveats around the `UnexpectedRowsExpectation`'s `unexpected_rows_query` attribute that deserve further detail.
Like any other Expectation, you can instantiate the `UnexpectedRowsExpectation` directly. You can also customize an `UnexpectedRowsExpectation` in essentially the same manner as you would [define a custom Expectation](/core/customize_expectations/define_a_custom_expectation_class.md), by subclassing `UnexpectedRowsExpectation` and providing customized default attributes and text for Data Docs. However, there are some caveats around the `UnexpectedRowsExpectation`'s `unexpected_rows_query` attribute that deserve further detail.

<!-- TODO: Do we want to discuss custom `_validate(...)` logic here, or should that be held for a future topic on building custom Expectation classes from scratch? -->

@@ -48,7 +48,7 @@ You customize an `UnexpectedRowsExpectation` in essentially the same manner as y

The `unexpected_rows_query` attribute is a SQL or Spark-SQL query that returns a selection of rows from the Batch of data being validated. By default, rows that are returned have failed the validation check.

Although the `unexpected_rows_query` should be written in standard SQL or Spark-SQL syntax, it must also contain the special `{batch}` placeholder. When the Expectation is evaluated, the `{batch}` placeholder will be replaced with the Batch of data that is validated.
The `unexpected_rows_query` should be written in standard SQL or Spark-SQL syntax, except that it can also contain the special `{batch}` named query. When the Expectation is evaluated, the `{batch}` keyword will be replaced with the Batch of data that is configured for your Data Asset.

In this example, `unexpected_rows_query` will select any rows where the passenger count is greater than `6` or less than `0`. These rows will fail validation for this Expectation:
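
The following is a minimal sketch of such a subclass, assuming the GX 1.0 `great_expectations.expectations` import alias and a hypothetical `passenger_count` column:

```python
import great_expectations.expectations as gxe


class ExpectPassengerCountToBeLegal(gxe.UnexpectedRowsExpectation):
    # Rows returned by this query are treated as unexpected and fail validation.
    unexpected_rows_query: str = (
        "SELECT * FROM {batch} WHERE passenger_count > 6 OR passenger_count < 0"
    )
    description: str = "There should be no rides with more than 6 or fewer than 0 passengers."
```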

18 changes: 18 additions & 0 deletions docs/docusaurus/docs/oss/changelog.md
@@ -14,6 +14,24 @@ When we deprecate our public APIs, we will

Before we completely remove the functionality in a new major release, there will be at least one minor release that contains the deprecation so that you can smoothly transition to the new API.

### 1.0.5
* [BUGFIX] Using `{batch}` keyword in `UnexpectedRowsQuery` ([#10392](https://github.com/great-expectations/great_expectations/pull/10392))
* [BUGFIX] Fix Databricks SQL Regex and Like based Expectations ([#10406](https://github.com/great-expectations/great_expectations/pull/10406))
* [BUGFIX] Support Spark connect dataframes ([#10420](https://github.com/great-expectations/great_expectations/pull/10420))
* [BUGFIX] Handle DatabricksSQL attribute error and update dependency ([#10424](https://github.com/great-expectations/great_expectations/pull/10424))
* [DOCS] Add Connect to Databricks SQL page in GX Cloud ([#10394](https://github.com/great-expectations/great_expectations/pull/10394)) (thanks @allisongx)
* [DOCS] Changelog updates `0.18.18` -> `0.18.21` ([#10422](https://github.com/great-expectations/great_expectations/pull/10422))
* [DOCS] Add Connect to Databricks SQL to GX Cloud docs TOC ([#10423](https://github.com/great-expectations/great_expectations/pull/10423))
* [MAINTENANCE] Fix `SqlAlchemyExecutionEngine.get_connection()` typing + update column identifier tests ([#10399](https://github.com/great-expectations/great_expectations/pull/10399))
* [MAINTENANCE] Move FabricPowerBIDatasource out of experimental dir ([#10419](https://github.com/great-expectations/great_expectations/pull/10419))

### 1.0.4
* [BUGFIX] Fix action equality ([#10393](https://github.com/great-expectations/great_expectations/pull/10393))
* [BUGFIX] Patch additional issues with data docs page retrieval in checkpoint actions ([#10400](https://github.com/great-expectations/great_expectations/pull/10400))
* [DOCS] Add CTAs to request a demo ([#10389](https://github.com/great-expectations/great_expectations/pull/10389))
* [MAINTENANCE] Ensure that all nested validation definition diagnostics are emitted from a parent checkpoint ([#10386](https://github.com/great-expectations/great_expectations/pull/10386))
* [MAINTENANCE] Fix `SqlAlchemyExecutionEngine.get_connection()` typing + update column identifier tests ([#10399](https://github.com/great-expectations/great_expectations/pull/10399))

### 1.0.3
* [FEATURE] Replace get_batch_list_from_batch_request with get_batch and get_batch_identifiers_list ([#10295](https://github.com/great-expectations/great_expectations/pull/10295))
* [FEATURE] Add Checkpoint.run analytics ([#10382](https://github.com/great-expectations/great_expectations/pull/10382))
10 changes: 8 additions & 2 deletions docs/docusaurus/docusaurus.config.js
@@ -298,10 +298,10 @@ module.exports = {
lastVersion: 'current',
versions: {
current: {
label: '1.0.3',
label: '1.0.5',
},
['0.18']: {
label: '0.18.17',
label: '0.18.21',
},
},
admonitions: {
@@ -325,6 +325,12 @@ module.exports = {
// Optional fields.
anonymizeIP: true, // Should IPs be anonymized?
},
sitemap: {
ignorePatterns: [
'**/0.18/oss/templates/**',
'**/0.18/oss/team_templates/**'
],
}
},
],
],
6 changes: 6 additions & 0 deletions docs/docusaurus/sidebars.js
@@ -190,6 +190,7 @@ module.exports = {
items: [
'cloud/connect/connect_postgresql',
'cloud/connect/connect_snowflake',
'cloud/connect/connect_databrickssql',
'cloud/connect/connect_airflow',
'cloud/connect/connect_python',
]
@@ -251,6 +252,11 @@ module.exports = {
label: 'Available Expectations',
href: '/docs/cloud/expectations/manage_expectations#available-expectations',
},
{
type: 'link',
label: 'Custom SQL Expectations',
href: '/docs/cloud/expectations/manage_expectations#custom-sql-expectations',
},
{
type: 'link',
label: 'Add an Expectation',
25 changes: 25 additions & 0 deletions docs/docusaurus/versioned_docs/version-0.18/oss/changelog.md
@@ -10,6 +10,31 @@ title: Changelog
- Deprecation warnings are accompanied by a moniker (as a code comment) indicating when they were deprecated. For example: `# deprecated-v0.13`
- Changes to methods and parameters due to deprecation are also noted in the relevant docstrings.

### 0.18.21
* [BUGFIX] Using `{batch}` keyword in `UnexpectedRowsQuery` (#10392) ([#10411](https://github.com/great-expectations/great_expectations/pull/10411))
* [BUGFIX] 0.18.x Ignore unsupported INTERVAL type as part of CDM ([#10414](https://github.com/great-expectations/great_expectations/pull/10414))
* [BUGFIX] 0.18.x Databricks SQL Pattern Expectation Fix ([#10415](https://github.com/great-expectations/great_expectations/pull/10415))
* [MAINTENANCE] Pass `description` from `Validator` to `ExpectationConfiguration` ([#10388](https://github.com/great-expectations/great_expectations/pull/10388))

### 0.18.20
* [FEATURE] Add `UnexpectedRowsExpectation` ([#10342](https://github.com/great-expectations/great_expectations/pull/10342))
* [BUGFIX] Remove illegible duplicate local Data Docs link from Slack renderer ([#10129](https://github.com/great-expectations/great_expectations/pull/10129))
* [MAINTENANCE] Ruff 0.5.3 + PR annotations ([#10128](https://github.com/great-expectations/great_expectations/pull/10128))
* [MAINTENANCE] Fix 0.18.x CI ([#10199](https://github.com/great-expectations/great_expectations/pull/10199))
* [MAINTENANCE] Update column identifier tests ([#8783](https://github.com/great-expectations/great_expectations/pull/8783))

### 0.18.19
* [FEATURE] Snowflake test for the presence of a schema in `test_connection()` ([#10100](https://github.com/great-expectations/great_expectations/pull/10100))
* [BUGFIX] Z-score renderer when `double_sided` ([#10085](https://github.com/great-expectations/great_expectations/pull/10085))
* [BUGFIX] SQLDatasource - lowercase unquoted `schema_names` for SQLAlchemy case-sensitivity compatibility ([#10107](https://github.com/great-expectations/great_expectations/pull/10107))
* [MAINTENANCE] Export `great_expectations.compatibility` types ([#10089](https://github.com/great-expectations/great_expectations/pull/10089))
* [MAINTENANCE] 0.18.x - mypy - `possibly-undefined` ([#10091](https://github.com/great-expectations/great_expectations/pull/10091))
* [MAINTENANCE] loosen ruamel pin ([#10081](https://github.com/great-expectations/great_expectations/pull/10081))

### 0.18.18
* [FEATURE] Add atomic renderer for `ExpectMulticolumnSumToEqual` (#10076) ([#10077](https://github.com/great-expectations/great_expectations/pull/10077))
* [FEATURE] Snowflake - narrow Account Identifier regex ([#10069](https://github.com/great-expectations/great_expectations/pull/10069))
* [FEATURE] Add missing atomic renderers to Expectations (#10079) ([#10080](https://github.com/great-expectations/great_expectations/pull/10080))

### 0.18.17
* [FEATURE] Snowflake - Better Account Identifier related TestConnectionErrors ([#10043](https://github.com/great-expectations/great_expectations/pull/10043))
2 changes: 1 addition & 1 deletion great_expectations/compatibility/databricks.py
@@ -5,6 +5,6 @@
)

try:
from databricks import connect # type: ignore[import-untyped]
from databricks import connect
except ImportError:
connect = DATABRICKS_CONNECT_NOT_IMPORTED
5 changes: 5 additions & 0 deletions great_expectations/compatibility/pyspark.py
@@ -39,6 +39,11 @@
except (ImportError, AttributeError):
Column = SPARK_NOT_IMPORTED # type: ignore[assignment,misc]

try:
from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame
except (ImportError, AttributeError):
ConnectDataFrame = SPARK_NOT_IMPORTED # type: ignore[assignment,misc]

try:
from pyspark.sql import DataFrame
except (ImportError, AttributeError):
2 changes: 1 addition & 1 deletion great_expectations/datasource/fluent/__init__.py
@@ -50,7 +50,7 @@
from great_expectations.datasource.fluent.pandas_azure_blob_storage_datasource import (
PandasAzureBlobStorageDatasource,
)
from great_expectations.experimental.datasource.fabric import FabricPowerBIDatasource
from great_expectations.datasource.fluent.fabric import FabricPowerBIDatasource
from great_expectations.datasource.fluent.postgres_datasource import (
PostgresDatasource,
)
19 changes: 14 additions & 5 deletions great_expectations/datasource/fluent/spark_datasource.py
@@ -5,6 +5,7 @@
from pprint import pformat as pf
from typing import (
TYPE_CHECKING,
Any,
ClassVar,
Dict,
Generic,
@@ -27,7 +28,7 @@
StrictInt,
StrictStr,
)
from great_expectations.compatibility.pyspark import DataFrame, pyspark
from great_expectations.compatibility.pyspark import ConnectDataFrame, DataFrame, pyspark
from great_expectations.compatibility.typing_extensions import override
from great_expectations.core import IDDict
from great_expectations.core.batch import LegacyBatchDefinition
@@ -47,7 +48,7 @@
from great_expectations.exceptions.exceptions import BuildBatchRequestError

if TYPE_CHECKING:
from typing_extensions import TypeAlias
from typing_extensions import TypeAlias, TypeGuard

from great_expectations.compatibility.pyspark import SparkSession
from great_expectations.core.batch_definition import BatchDefinition
@@ -231,9 +232,9 @@ def build_batch_request(
if not (options is not None and "dataframe" in options and len(options) == 1):
raise BuildBatchRequestError(message="options must contain exactly 1 key, 'dataframe'.")

if not isinstance(options["dataframe"], DataFrame):
if not self.is_spark_data_frame(options["dataframe"]):
raise BuildBatchRequestError(
message="Can not build batch request for dataframe asset " "without a dataframe."
message="Cannot build batch request without a Spark DataFrame."
)

return BatchRequest(
@@ -255,7 +256,7 @@ def _validate_batch_request(self, batch_request: BatchRequest) -> None:
and batch_request.options
and len(batch_request.options) == 1
and "dataframe" in batch_request.options
and isinstance(batch_request.options["dataframe"], DataFrame)
and self.is_spark_data_frame(batch_request.options["dataframe"])
):
expect_batch_request_form = BatchRequest[None](
datasource_name=self.datasource.name,
@@ -314,6 +315,14 @@ def add_batch_definition_whole_dataframe(self, name: str) -> BatchDefinition:
partitioner=None,
)

@staticmethod
def is_spark_data_frame(df: Any) -> TypeGuard[Union[DataFrame, ConnectDataFrame]]:
"""Check that a given object is a Spark DataFrame.
This could either be a regular Spark DataFrame or a Spark Connect DataFrame.
"""
data_frame_types = [DataFrame, ConnectDataFrame]
return any((cls and isinstance(df, cls)) for cls in data_frame_types)


@public_api
class SparkDatasource(_SparkDatasource):
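
Taken together, these changes let a Spark Connect DataFrame be passed anywhere a regular Spark DataFrame is accepted. A minimal usage sketch, assuming GX 1.0's fluent API and a local Spark Connect server such as the docker-compose service above:

```python
import great_expectations as gx
from pyspark.sql import SparkSession

# Obtain a Spark Connect session and a Connect DataFrame.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
df = spark.createDataFrame([(1, 2), (2, 9)], ["ride_id", "passenger_count"])

context = gx.get_context()
data_source = context.data_sources.add_spark(name="spark_connect_source")
asset = data_source.add_dataframe_asset(name="rides")
batch_definition = asset.add_batch_definition_whole_dataframe("whole dataframe")

# The Connect DataFrame passes the is_spark_data_frame check introduced above.
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})
```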
2 changes: 1 addition & 1 deletion great_expectations/deployment_version
@@ -1 +1 @@
1.0.3
1.0.5