From 167fbac6102a1fcf4622d9247d92d44cb1467f0e Mon Sep 17 00:00:00 2001
From: William Shin
Date: Mon, 16 Dec 2024 18:18:58 -0800
Subject: [PATCH 01/19] adding databricks types

---
 .../compatibility/databricks.py | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index ad0d5072bf5a..9497b8bef493 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -1,3 +1,5 @@
+from sqlalchemy import BIGINT
+
 from great_expectations.compatibility.not_imported import NotImported

 DATABRICKS_CONNECT_NOT_IMPORTED = NotImported(
@@ -8,3 +10,106 @@
     from databricks import connect
 except ImportError:
     connect = DATABRICKS_CONNECT_NOT_IMPORTED
+# The following types are modeled after the following documentation that is part
+# of the databricks package.
+# tldr: SQLAlchemy application should (mostly) "just work" with Databricks.
+# https://github.com/databricks/databricks-sql-python/blob/main/src/databricks/sqlalchemy/README.sqlalchemy.md
+
+try:
+    # Importing ENUM of every Databricks SQL Type that is shown here
+    # https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html
+    from databricks.sql.parameters.native import DatabricksSupportedType
+except ImportError:
+    DatabricksSupportedType = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    BIGINT = DatabricksSupportedType.BIGINT
+except (ImportError, AttributeError):
+    BIGINT = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    BOOLEAN = DatabricksSupportedType.BOOLEAN
+except (ImportError, AttributeError):
+    BOOLEAN = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    DATE = DatabricksSupportedType.DATE
+except (ImportError, AttributeError):
+    DATE = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    from databricks.sqlalchemy._types import TIMESTAMP_NTZ as TIMESTAMP_NTZ  # noqa: PLC0414, RUF100
+except (ImportError, AttributeError):
+    TIMESTAMP_NTZ = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    DOUBLE = DatabricksSupportedType.DOUBLE
+except (ImportError, AttributeError):
+    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    FLOAT = DatabricksSupportedType.FLOAT
+except (ImportError, AttributeError):
+    FLOAT = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    INT = DatabricksSupportedType.INT
+except (ImportError, AttributeError):
+    INT = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    DECIMAL = DatabricksSupportedType.DECIMAL
+except (ImportError, AttributeError):
+    DECIMAL = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    SMALLINT = DatabricksSupportedType.SMALLINT
+except (ImportError, AttributeError):
+    SMALLINT = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    INTERVAL = DatabricksSupportedType.INTERVAL
+except (ImportError, AttributeError):
+    INTERVAL = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    VOID = DatabricksSupportedType.VOID
+except (ImportError, AttributeError):
+    VOID = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    SMALLINT = DatabricksSupportedType.SMALLINT
+except (ImportError, AttributeError):
+    SMALLINT = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    from databricks.sqlalchemy._types import DatabricksStringType as STRING  # noqa: PLC0414, RUF100
+except (ImportError, AttributeError):
+    STRING = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    from databricks.sqlalchemy._types import TIMESTAMP as TIMESTAMP  # noqa: PLC0414, RUF100
+except (ImportError, AttributeError):
+    TIMESTAMP = DATABRICKS_CONNECT_NOT_IMPORTED
+
+try:
+    from databricks.sqlalchemy._types import TINYINT as TINYINT  # noqa: PLC0414, RUF100
+except (ImportError, AttributeError):
+    TINYINT = DATABRICKS_CONNECT_NOT_IMPORTED
+
+
+class DATABRICKS_TYPES:
+    """Namespace for Databricks dialect types"""
+
+    BIGINT = BIGINT
+    BOOLEAN = BOOLEAN
+    DATE = DATE
+    TIMESTAMP_NTZ = TIMESTAMP_NTZ
+    DOUBLE = DOUBLE
+    FLOAT = FLOAT
+    INT = INT
+    DECIMAL = DECIMAL
+    SMALLINT = SMALLINT
+    STRING = STRING
+    TIMESTAMP = TIMESTAMP
+    TINYINT = TINYINT

From c18c9e52f2ae0b878afc1966e0cbcc0ae4fa82e5 Mon Sep 17 00:00:00 2001
From: William Shin
Date: Mon, 16 Dec 2024 18:21:01 -0800
Subject: [PATCH 02/19] Update databricks.py

---
 great_expectations/compatibility/databricks.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index 9497b8bef493..f2c861fea1e7 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -1,5 +1,3 @@
-from sqlalchemy import BIGINT
-
 from great_expectations.compatibility.not_imported import NotImported

 DATABRICKS_CONNECT_NOT_IMPORTED = NotImported(

From 51df6f7b32f8c2285f45193950af1da7827f1f5c Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 12:05:43 -0800
Subject: [PATCH 03/19] Update databricks.py

---
 great_expectations/compatibility/databricks.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index f2c861fea1e7..e50212163b07 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from great_expectations.compatibility.not_imported import NotImported

 DATABRICKS_CONNECT_NOT_IMPORTED = NotImported(
@@ -12,7 +14,6 @@
 # of the databricks package.
 # tldr: SQLAlchemy application should (mostly) "just work" with Databricks.
 # https://github.com/databricks/databricks-sql-python/blob/main/src/databricks/sqlalchemy/README.sqlalchemy.md
-
 try:
     # Importing ENUM of every Databricks SQL Type that is shown here
     # https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html
     from databricks.sql.parameters.native import DatabricksSupportedType

From b05b84c16a2f340fce8aad7cb3bc9df32692e988 Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 15:16:04 -0800
Subject: [PATCH 04/19] type ignore

---
 .../compatibility/databricks.py | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index e50212163b07..11210a341db7 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -24,77 +24,77 @@
 try:
     BIGINT = DatabricksSupportedType.BIGINT
 except (ImportError, AttributeError):
-    BIGINT = DATABRICKS_CONNECT_NOT_IMPORTED
+    BIGINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     BOOLEAN = DatabricksSupportedType.BOOLEAN
 except (ImportError, AttributeError):
-    BOOLEAN = DATABRICKS_CONNECT_NOT_IMPORTED
+    BOOLEAN = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     DATE = DatabricksSupportedType.DATE
 except (ImportError, AttributeError):
-    DATE = DATABRICKS_CONNECT_NOT_IMPORTED
+    DATE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     from databricks.sqlalchemy._types import TIMESTAMP_NTZ as TIMESTAMP_NTZ  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    TIMESTAMP_NTZ = DATABRICKS_CONNECT_NOT_IMPORTED
+    TIMESTAMP_NTZ = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     DOUBLE = DatabricksSupportedType.DOUBLE
 except (ImportError, AttributeError):
-    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED
+    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     FLOAT = DatabricksSupportedType.FLOAT
 except (ImportError, AttributeError):
-    FLOAT = DATABRICKS_CONNECT_NOT_IMPORTED
+    FLOAT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     INT = DatabricksSupportedType.INT
 except (ImportError, AttributeError):
-    INT = DATABRICKS_CONNECT_NOT_IMPORTED
+    INT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     DECIMAL = DatabricksSupportedType.DECIMAL
 except (ImportError, AttributeError):
-    DECIMAL = DATABRICKS_CONNECT_NOT_IMPORTED
+    DECIMAL = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     SMALLINT = DatabricksSupportedType.SMALLINT
 except (ImportError, AttributeError):
-    SMALLINT = DATABRICKS_CONNECT_NOT_IMPORTED
+    SMALLINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     INTERVAL = DatabricksSupportedType.INTERVAL
 except (ImportError, AttributeError):
-    INTERVAL = DATABRICKS_CONNECT_NOT_IMPORTED
+    INTERVAL = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     VOID = DatabricksSupportedType.VOID
 except (ImportError, AttributeError):
-    VOID = DATABRICKS_CONNECT_NOT_IMPORTED
+    VOID = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     SMALLINT = DatabricksSupportedType.SMALLINT
 except (ImportError, AttributeError):
-    SMALLINT = DATABRICKS_CONNECT_NOT_IMPORTED
+    SMALLINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     from databricks.sqlalchemy._types import DatabricksStringType as STRING  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    STRING = DATABRICKS_CONNECT_NOT_IMPORTED
+    STRING = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     from databricks.sqlalchemy._types import TIMESTAMP as TIMESTAMP  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    TIMESTAMP = DATABRICKS_CONNECT_NOT_IMPORTED
+    TIMESTAMP = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     from databricks.sqlalchemy._types import TINYINT as TINYINT  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    TINYINT = DATABRICKS_CONNECT_NOT_IMPORTED
+    TINYINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]


 class DATABRICKS_TYPES:

From af769b14b0b714b0f022ae20149b9e1dc53c4f5f Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 15:26:35 -0800
Subject: [PATCH 05/19] a little more general

---
 great_expectations/compatibility/databricks.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index 11210a341db7..dba3470916a0 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -6,10 +6,7 @@
     "databricks-connect is not installed, please 'pip install databricks-connect'"
 )

-try:
-    from databricks import connect
-except ImportError:
-    connect = DATABRICKS_CONNECT_NOT_IMPORTED
+
 # The following types are modeled after the following documentation that is part
 # of the databricks package.
 # tldr: SQLAlchemy application should (mostly) "just work" with Databricks.
@@ -19,7 +16,7 @@
     # https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html
     from databricks.sql.parameters.native import DatabricksSupportedType
 except ImportError:
-    DatabricksSupportedType = DATABRICKS_CONNECT_NOT_IMPORTED
+    DatabricksSupportedType = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     BIGINT = DatabricksSupportedType.BIGINT
 except (ImportError, AttributeError):
@@ -41,7 +38,7 @@
 try:
     DOUBLE = DatabricksSupportedType.DOUBLE
 except (ImportError, AttributeError):
-    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
+    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment, misc]

 try:
     FLOAT = DatabricksSupportedType.FLOAT
@@ -84,17 +81,17 @@
 try:
     from databricks.sqlalchemy._types import DatabricksStringType as STRING  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    STRING = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
+    STRING = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment, string]

 try:
     from databricks.sqlalchemy._types import TIMESTAMP as TIMESTAMP  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    TIMESTAMP = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
+    TIMESTAMP = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment, misc]

 try:
     from databricks.sqlalchemy._types import TINYINT as TINYINT  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    TINYINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
+    TINYINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment, misc]


 class DATABRICKS_TYPES:

From ce007c5d911b9a8d490d03b90f776b0f854739da Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 15:35:12 -0800
Subject: [PATCH 06/19] Update databricks.py

---
 great_expectations/compatibility/databricks.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index dba3470916a0..8de87d2fd400 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -36,12 +36,12 @@
 try:
     from databricks.sqlalchemy._types import TIMESTAMP_NTZ as TIMESTAMP_NTZ  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    TIMESTAMP_NTZ = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
+    TIMESTAMP_NTZ = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc]

 try:
     DOUBLE = DatabricksSupportedType.DOUBLE
 except (ImportError, AttributeError):
-    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment, misc]
+    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc]

 try:
     FLOAT = DatabricksSupportedType.FLOAT
@@ -81,17 +81,17 @@
 try:
     from databricks.sqlalchemy._types import DatabricksStringType as STRING  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    STRING = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment, string]
+    STRING = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc]

 try:
     from databricks.sqlalchemy._types import TIMESTAMP as TIMESTAMP  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    TIMESTAMP = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment, misc]
+    TIMESTAMP = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc]

 try:
     from databricks.sqlalchemy._types import TINYINT as TINYINT  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    TINYINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment, misc]
+    TINYINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc]


 class DATABRICKS_TYPES:

From 03e39c29c350b467d5e94befb0e50fcea710bdfb Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 15:52:27 -0800
Subject: [PATCH 07/19] Update databricks.py

---
 great_expectations/compatibility/databricks.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index 8de87d2fd400..67dd37a73ce3 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -36,7 +36,7 @@
 try:
     from databricks.sqlalchemy._types import TIMESTAMP_NTZ as TIMESTAMP_NTZ  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    TIMESTAMP_NTZ = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc]
+    TIMESTAMP_NTZ = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc, assignment]

 try:
     DOUBLE = DatabricksSupportedType.DOUBLE
@@ -81,17 +81,17 @@
 try:
     from databricks.sqlalchemy._types import DatabricksStringType as STRING  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    STRING = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc]
+    STRING = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc, assignment]

 try:
     from databricks.sqlalchemy._types import TIMESTAMP as TIMESTAMP  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    TIMESTAMP = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc]
+    TIMESTAMP = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc, assignment]

 try:
     from databricks.sqlalchemy._types import TINYINT as TINYINT  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
-    TINYINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc]
+    TINYINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc, assignment]


 class DATABRICKS_TYPES:

From f5ccb4ccf9c0bfb46932db955e4b5c5c0fa392f4 Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 16:04:34 -0800
Subject: [PATCH 08/19] Update databricks.py

---
 great_expectations/compatibility/databricks.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index 67dd37a73ce3..2b67e6c0422a 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -16,8 +16,7 @@
     # https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html
     from databricks.sql.parameters.native import DatabricksSupportedType
 except ImportError:
-    DatabricksSupportedType = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
-
+    DatabricksSupportedType = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc, assignment]
 try:
     BIGINT = DatabricksSupportedType.BIGINT
 except (ImportError, AttributeError):
@@ -41,7 +40,7 @@
 try:
     DOUBLE = DatabricksSupportedType.DOUBLE
 except (ImportError, AttributeError):
-    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc]
+    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc, assignment]

 try:
     FLOAT = DatabricksSupportedType.FLOAT

From 24679251e45386477d5ac685fb2465df90a7f203 Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 16:32:00 -0800
Subject: [PATCH 09/19] Update test_expect_column_values_to_be_in_type_list.py

---
 ...expect_column_values_to_be_in_type_list.py | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
index c893f71327b9..348c1e251357 100644
--- a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
+++ b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
@@ -3,6 +3,7 @@
 import sqlalchemy.types as sqltypes

 import great_expectations.expectations as gxe
+from great_expectations.compatibility.databricks import DATABRICKS_TYPES
 from great_expectations.compatibility.snowflake import SNOWFLAKE_TYPES
 from great_expectations.core.result_format import ResultFormat
 from great_expectations.datasource.fluent.interfaces import Batch
@@ -379,3 +380,42 @@ def test_success_complete_snowflake(
     assert isinstance(result_dict["observed_value"], str)
     assert isinstance(expectation.type_list, list)
     assert result_dict["observed_value"] in expectation.type_list
+
+
+@pytest.mark.parametrize(
+    "expectation",
+    [
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(
+                column="STRING", type_list=["STRING", "VARCHAR(16777216)"]
+            ),
+            id="STRING",
+        )
+    ],
+)
+@parameterize_batch_for_data_sources(
+    data_source_configs=[
+        SnowflakeDatasourceTestConfig(
+            column_types={
+                "STRING": DATABRICKS_TYPES.STRING,
+            }
+        )
+    ],
+    data=pd.DataFrame(
+        {
+            "STRING": ["a", "b", "c"],
+        },
+        dtype="object",
+    ),
+)
+def test_success_complete_databricks(
+    batch_for_datasource: Batch, expectation: gxe.ExpectColumnValuesToBeInTypeList
+) -> None:
+    result = batch_for_datasource.validate(expectation, result_format=ResultFormat.COMPLETE)
+    result_dict = result.to_json_dict()["result"]
+
+    assert result.success
+    assert isinstance(result_dict, dict)
+    assert isinstance(result_dict["observed_value"], str)
+    assert isinstance(expectation.type_list, list)
+    assert result_dict["observed_value"] in expectation.type_list

From 806648fb79eec4bf6c0ff4db44ce3573610bbefc Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 16:32:30 -0800
Subject: [PATCH 10/19] Update databricks.py

---
 great_expectations/compatibility/databricks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index 2b67e6c0422a..cee600beae62 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -40,7 +40,7 @@
 try:
     DOUBLE = DatabricksSupportedType.DOUBLE
 except (ImportError, AttributeError):
-    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc, assignment]
+    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     FLOAT = DatabricksSupportedType.FLOAT

From 9d6f5105cc3b47e34da8f76a9e28e13c8a21c99c Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 16:43:52 -0800
Subject: [PATCH 11/19] the fix

---
 .../expect_column_values_to_be_in_type_list.py |  2 +-
 .../core/expect_column_values_to_be_of_type.py |  2 +-
 great_expectations/expectations/metrics/util.py | 15 +++++++++------
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py b/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py
index 0f9d82e42771..e0bd10f36942 100644
--- a/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py
+++ b/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py
@@ -458,7 +458,7 @@ def _validate_pandas(  # noqa: C901, PLR0912
     def _validate_sqlalchemy(self, actual_column_type, expected_types_list, execution_engine):
         if expected_types_list is None:
             success = True
-        elif execution_engine.dialect_name == GXSqlDialect.SNOWFLAKE:
+        elif execution_engine.dialect_name in [GXSqlDialect.SNOWFLAKE, GXSqlDialect.DATABRICKS]:
             success = isinstance(actual_column_type, str) and any(
                 actual_column_type.lower() == expected_type.lower()
                 for expected_type in expected_types_list

diff --git a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py
index 5429c3979882..647053760d6d 100644
--- a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py
+++ b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py
@@ -412,7 +412,7 @@ def _validate_sqlalchemy(self, actual_column_type, expected_type, execution_engi

         if expected_type is None:
             success = True
-        elif execution_engine.dialect_name == GXSqlDialect.SNOWFLAKE:
+        elif execution_engine.dialect_name in [GXSqlDialect.SNOWFLAKE, GXSqlDialect.DATABRICKS]:
             success = (
                 isinstance(actual_column_type, str)
                 and actual_column_type.lower() == expected_type.lower()

diff --git a/great_expectations/expectations/metrics/util.py b/great_expectations/expectations/metrics/util.py
index c3dc274bfc94..c89f2e870477 100644
--- a/great_expectations/expectations/metrics/util.py
+++ b/great_expectations/expectations/metrics/util.py
@@ -414,16 +414,19 @@ def get_sqlalchemy_column_metadata(  # noqa: C901
         )

         dialect_name = execution_engine.dialect.name
-        if dialect_name == GXSqlDialect.SNOWFLAKE:
+        if dialect_name in [GXSqlDialect.SNOWFLAKE, GXSqlDialect.DATABRICKS]:
             # WARNING: Do not alter columns in place, as they are cached on the inspector
             columns_copy = [column.copy() for column in columns]
             for column in columns_copy:
                 column["type"] = column["type"].compile(dialect=execution_engine.dialect)
-            return [
-                # TODO: SmartColumn should know the dialect and do lookups based on that
-                CaseInsensitiveNameDict(column)
-                for column in columns_copy
-            ]
+            if dialect_name == GXSqlDialect.SNOWFLAKE:
+                return [
+                    # TODO: SmartColumn should know the dialect and do lookups based on that
+                    CaseInsensitiveNameDict(column)
+                    for column in columns_copy
+                ]
+            else:
+                return columns_copy

         return columns
     except AttributeError as e:

From 0d68ca28c845d3473e08df4df810971f51317e0e Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 16:47:39 -0800
Subject: [PATCH 12/19] Update databricks.py

---
 great_expectations/compatibility/databricks.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index cee600beae62..1a2709afef20 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -6,6 +6,10 @@
     "databricks-connect is not installed, please 'pip install databricks-connect'"
 )

+try:
+    from databricks import connect
+except ImportError:
+    connect = DATABRICKS_CONNECT_NOT_IMPORTED

 # The following types are modeled after the following documentation that is part
 # of the databricks package.

From d6cf5dd74ff7a45c6a64d507aa3fc569e0e2947a Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 16:57:18 -0800
Subject: [PATCH 13/19] adding test

---
 .../test_expect_column_values_to_be_in_type_list.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
index 348c1e251357..a85a582ed6bd 100644
--- a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
+++ b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
@@ -386,18 +386,17 @@
     "expectation",
     [
         pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(
-                column="STRING", type_list=["STRING", "VARCHAR(16777216)"]
-            ),
+            gxe.ExpectColumnValuesToBeInTypeList(column="STRING", type_list=["STRING"]),
             id="STRING",
         )
     ],
 )
 @parameterize_batch_for_data_sources(
     data_source_configs=[
-        SnowflakeDatasourceTestConfig(
+        DatabricksDatasourceTestConfig(
             column_types={
                 "STRING": DATABRICKS_TYPES.STRING,
+                "": DATABRICKS_TYPES.STRING,
             }
         )
     ],
@@ -417,5 +416,6 @@ def test_success_complete_databricks(
     assert result.success
     assert isinstance(result_dict, dict)
     assert isinstance(result_dict["observed_value"], str)
+    print(f"result: {result_dict['observed_value']}")
     assert isinstance(expectation.type_list, list)
     assert result_dict["observed_value"] in expectation.type_list

From c0f90ea919a2338b5c23df05be9f41276c23cb57 Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 16:57:42 -0800
Subject: [PATCH 14/19] a better import

---
 great_expectations/compatibility/databricks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index 1a2709afef20..2be3de1e07b4 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -8,7 +8,7 @@

 try:
     from databricks import connect
-except ImportError:
+except (ImportError, AttributeError):
     connect = DATABRICKS_CONNECT_NOT_IMPORTED

 # The following types are modeled after the following documentation that is part

From 9b15bb5205a40c5d313116abfe9370b1fe195503 Mon Sep 17 00:00:00 2001
From: William Shin
Date: Tue, 17 Dec 2024 22:13:16 -0800
Subject: [PATCH 15/19] tests and new types

---
 .../compatibility/databricks.py               |  73 +---------
 ...expect_column_values_to_be_in_type_list.py | 126 +++++++++++++++++-
 2 files changed, 126 insertions(+), 73 deletions(-)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index 2be3de1e07b4..08f561ed0630 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -13,74 +13,15 @@
 # The following types are modeled after the following documentation that is part
 # of the databricks package.
-# tldr: SQLAlchemy application should (mostly) "just work" with Databricks.
+# tldr: SQLAlchemy application should (mostly) "just work" with Databricks,
+# other than the exceptions below
 # https://github.com/databricks/databricks-sql-python/blob/main/src/databricks/sqlalchemy/README.sqlalchemy.md
-try:
-    # Importing ENUM of every Databricks SQL Type that is shown here
-    # https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html
-    from databricks.sql.parameters.native import DatabricksSupportedType
-except ImportError:
-    DatabricksSupportedType = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc, assignment]
-try:
-    BIGINT = DatabricksSupportedType.BIGINT
-except (ImportError, AttributeError):
-    BIGINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
-
-try:
-    BOOLEAN = DatabricksSupportedType.BOOLEAN
-except (ImportError, AttributeError):
-    BOOLEAN = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
-
-try:
-    DATE = DatabricksSupportedType.DATE
-except (ImportError, AttributeError):
-    DATE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]

 try:
     from databricks.sqlalchemy._types import TIMESTAMP_NTZ as TIMESTAMP_NTZ  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
     TIMESTAMP_NTZ = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc, assignment]

-try:
-    DOUBLE = DatabricksSupportedType.DOUBLE
-except (ImportError, AttributeError):
-    DOUBLE = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
-
-try:
-    FLOAT = DatabricksSupportedType.FLOAT
-except (ImportError, AttributeError):
-    FLOAT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
-
-try:
-    INT = DatabricksSupportedType.INT
-except (ImportError, AttributeError):
-    INT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
-
-try:
-    DECIMAL = DatabricksSupportedType.DECIMAL
-except (ImportError, AttributeError):
-    DECIMAL = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
-
-try:
-    SMALLINT = DatabricksSupportedType.SMALLINT
-except (ImportError, AttributeError):
-    SMALLINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
-
-try:
-    INTERVAL = DatabricksSupportedType.INTERVAL
-except (ImportError, AttributeError):
-    INTERVAL = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
-
-try:
-    VOID = DatabricksSupportedType.VOID
-except (ImportError, AttributeError):
-    VOID = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
-
-try:
-    SMALLINT = DatabricksSupportedType.SMALLINT
-except (ImportError, AttributeError):
-    SMALLINT = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[assignment]
-
 try:
     from databricks.sqlalchemy._types import DatabricksStringType as STRING  # noqa: PLC0414, RUF100
 except (ImportError, AttributeError):
     STRING = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc, assignment]
@@ -100,15 +41,7 @@
 class DATABRICKS_TYPES:
     """Namespace for Databricks dialect types"""

-    BIGINT = BIGINT
-    BOOLEAN = BOOLEAN
-    DATE = DATE
     TIMESTAMP_NTZ = TIMESTAMP_NTZ
-    DOUBLE = DOUBLE
-    FLOAT = FLOAT
-    INT = INT
-    DECIMAL = DECIMAL
-    SMALLINT = SMALLINT
     STRING = STRING
-    TIMESTAMP = TIMESTAMP
     TINYINT = TINYINT
+    TIMESTAMP = TIMESTAMP

diff --git a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
index a85a582ed6bd..2aa9a9e73064 100644
--- a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
+++ b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
@@ -388,7 +388,78 @@ def test_success_complete_snowflake(
         pytest.param(
             gxe.ExpectColumnValuesToBeInTypeList(column="STRING", type_list=["STRING"]),
             id="STRING",
-        )
+        ),
+        # SqlA Text gets converted to Databricks STRING
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(column="TEXT", type_list=["STRING"]),
+            id="TEXT",
+        ),
+        # SqlA Time gets converted to Databricks STRING
+        # pytest.param(
+        #     gxe.ExpectColumnValuesToBeInTypeList(column="TIME", type_list=["STRING"]),
+        #     id="TIME",
+        # ),
+        # SqlA UNICODE gets converted to Databricks STRING
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(column="UNICODE", type_list=["STRING"]),
+            id="UNICODE",
+        ),
+        # SqlA UNICODE_TEXT gets converted to Databricks STRING
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(column="UNICODE_TEXT", type_list=["STRING"]),
+            id="UNICODE_TEXT",
+        ),
+        # SqlA UUID gets converted to Databricks STRING
+        # pytest.param(
+        #     gxe.ExpectColumnValuesToBeInTypeList(column="UUID", type_list=["STRING"]),
+        #     id="UUID",
+        # )
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(column="BOOLEAN", type_list=["BOOLEAN"]),
+            id="BOOLEAN",
+        ),
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(
+                column="DECIMAL", type_list=["DECIMAL", "DECIMAL(10, 0)"]
+            ),
+            id="DECIMAL",
+        ),
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(column="DATE", type_list=["DATE"]),
+            id="DATE",
+        ),
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(column="TIMESTAMP", type_list=["TIMESTAMP"]),
+            id="TIMESTAMP",
+        ),
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(
+                column="TIMESTAMP_NTZ", type_list=["TIMESTAMP_NTZ"]
+            ),
+            id="TIMESTAMP_NTZ",
+        ),
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(column="DOUBLE", type_list=["DOUBLE", "FLOAT"]),
+            id="DOUBLE",
+        ),
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(column="FLOAT", type_list=["FLOAT"]),
+            id="FLOAT",
+        ),
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(column="INT", type_list=["INT"]),
+            id="INT",
+        ),
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(column="TINYINT", type_list=["TINYINT"]),
+            id="TINYINT",
+        ),
+        pytest.param(
+            gxe.ExpectColumnValuesToBeInTypeList(
+                column="DECIMAL", type_list=["DECIMAL", "DECIMAL(10, 0)"]
+            ),
+            id="DECIMAL",
+        ),
     ],
 )
 @parameterize_batch_for_data_sources(
     data_source_configs=[
         DatabricksDatasourceTestConfig(
             column_types={
                 "STRING": DATABRICKS_TYPES.STRING,
-                "": DATABRICKS_TYPES.STRING,
+                "TEXT": sqltypes.Text,
+                "UNICODE": sqltypes.Unicode,
+                "UNICODE_TEXT": sqltypes.UnicodeText,
+                "BIGINT": sqltypes.BigInteger,
+                "BOOLEAN": sqltypes.BOOLEAN,
+                "DATE": sqltypes.DATE,
+                "TIMESTAMP_NTZ": DATABRICKS_TYPES.TIMESTAMP_NTZ,
+                "TIMESTAMP": DATABRICKS_TYPES.TIMESTAMP,
+                "DOUBLE": sqltypes.DOUBLE,
+                "FLOAT": sqltypes.FLOAT,
+                "INT": sqltypes.Integer,
+                "DECIMAL": sqltypes.Numeric,
+                "SMALLINT": sqltypes.SmallInteger,
+                "TINYINT": DATABRICKS_TYPES.TINYINT,
+                # "TIME": sqltypes.Time,
+                # "UUID": sqltypes.UUID,
             }
         )
     ],
     data=pd.DataFrame(
         {
             "STRING": ["a", "b", "c"],
+            "TEXT": ["a", "b", "c"],
+            "UNICODE": ["\u00e9", "\u00e9", "\u00e9"],
+            "UNICODE_TEXT": ["a", "b", "c"],
+            "BIGINT": [1111, 2222, 3333],
+            "BOOLEAN": [True, True, False],
+            "DATE": [
+                "2021-01-01",
+                "2021-01-02",
+                "2021-01-03",
+            ],
+            "TIMESTAMP_NTZ": [
+                "2021-01-01 00:00:00",
+                "2021-01-02 00:00:00",
+                "2021-01-03 00:00:00",
+            ],
+            "TIMESTAMP": [
+                "2021-01-01 00:00:00",
+                "2021-01-02 00:00:00",
+                "2021-01-03 00:00:00",
+            ],
+            "DOUBLE": [1.0, 2.0, 3.0],
+            "FLOAT": [1.0, 2.0, 3.0],
+            "INT": [1, 2, 3],
+            "DECIMAL": [1.1, 2.2, 3.3],
+            "SMALLINT": [1, 2, 3],
+            # "TIME": [
+            #     "00:00:00",
+            #     "01:00:00",
+            #     "00:10:43",
+            # ],
+            # "UUID": [
+            #     "905993ea-f50e-4284-bea0-5be3f0ed7031",
+            #     "9406b631-fa2f-41cf-b666-f9a2ac3118c1",
+            #     "47538f05-32e3-4594-80e2-0b3b33257ae7"
+            # ],
         },
         dtype="object",
     ),
 )
 def test_success_complete_databricks(
     batch_for_datasource: Batch, expectation: gxe.ExpectColumnValuesToBeInTypeList
 ) -> None:
     result = batch_for_datasource.validate(expectation, result_format=ResultFormat.COMPLETE)
     result_dict = result.to_json_dict()["result"]

     assert result.success
     assert isinstance(result_dict, dict)
     assert isinstance(result_dict["observed_value"], str)
-    print(f"result: {result_dict['observed_value']}")
     assert isinstance(expectation.type_list, list)
     assert result_dict["observed_value"] in expectation.type_list

From a87db03bf529329aa3aaccbeffef2fd7ded3f8f0 Mon Sep 17 00:00:00 2001
From: William Shin
Date: Wed, 18 Dec 2024 09:54:38 -0800
Subject: [PATCH 16/19] Update databricks.py

---
 great_expectations/compatibility/databricks.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/great_expectations/compatibility/databricks.py b/great_expectations/compatibility/databricks.py
index 08f561ed0630..3f18fbb6a747 100644
--- a/great_expectations/compatibility/databricks.py
+++ b/great_expectations/compatibility/databricks.py
@@ -6,11 +6,6 @@
     "databricks-connect is not installed, please 'pip install databricks-connect'"
 )

-try:
-    from databricks import connect
-except (ImportError, AttributeError):
-    connect = DATABRICKS_CONNECT_NOT_IMPORTED
-
 # The following types are modeled after the following documentation that is part
 # of the databricks package.
 # tldr: SQLAlchemy application should (mostly) "just work" with Databricks,

From 7dba97837e0131876f9641aa3d27688059f62c8f Mon Sep 17 00:00:00 2001
From: William Shin
Date: Wed, 18 Dec 2024 10:10:38 -0800
Subject: [PATCH 17/19] a bit of clean up before review.

---
 ...expect_column_values_to_be_in_type_list.py | 34 ++++++++++---------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
index 2aa9a9e73064..954e7b5032b0 100644
--- a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
+++ b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
@@ -394,11 +394,6 @@ def test_success_complete_snowflake(
             gxe.ExpectColumnValuesToBeInTypeList(column="TEXT", type_list=["STRING"]),
             id="TEXT",
         ),
-        # SqlA Time gets converted to Databricks STRING
-        # pytest.param(
-        #     gxe.ExpectColumnValuesToBeInTypeList(column="TIME", type_list=["STRING"]),
-        #     id="TIME",
-        # ),
         # SqlA UNICODE gets converted to Databricks STRING
         pytest.param(
             gxe.ExpectColumnValuesToBeInTypeList(column="UNICODE", type_list=["STRING"]),
             id="UNICODE",
@@ -409,11 +404,6 @@
             gxe.ExpectColumnValuesToBeInTypeList(column="UNICODE_TEXT", type_list=["STRING"]),
             id="UNICODE_TEXT",
         ),
-        # SqlA UUID gets converted to Databricks STRING
-        # pytest.param(
-        #     gxe.ExpectColumnValuesToBeInTypeList(column="UUID", type_list=["STRING"]),
-        #     id="UUID",
-        # )
         pytest.param(
             gxe.ExpectColumnValuesToBeInTypeList(column="BOOLEAN", type_list=["BOOLEAN"]),
             id="BOOLEAN",
@@ -450,6 +450,18 @@
             ),
             id="DECIMAL",
         ),
+        # SqlA Time gets converted to Databricks STRING,
+        # but is not supported by our testing framework
+        # pytest.param(
+        #     gxe.ExpectColumnValuesToBeInTypeList(column="TIME", type_list=["STRING"]),
+        #     id="TIME",
+        # ),
+        # SqlA UUID gets converted to Databricks STRING,
+        # but is not supported by our testing framework.
+        # pytest.param(
+        #     gxe.ExpectColumnValuesToBeInTypeList(column="UUID", type_list=["STRING"]),
+        #     id="UUID",
+        # )
     ],
 )
 @parameterize_batch_for_data_sources(
@@ -515,14 +517,14 @@
             "DECIMAL": [1.1, 2.2, 3.3],
             "SMALLINT": [1, 2, 3],
             # "TIME": [
-            #     "00:00:00",
-            #     "01:00:00",
-            #     "00:10:43",
+            #     sa.Time("22:17:33.123456"),
+            #     sa.Time("22:17:33.123456"),
+            #     sa.Time("22:17:33.123456"),
             # ],
             # "UUID": [
-            #     "905993ea-f50e-4284-bea0-5be3f0ed7031",
-            #     "9406b631-fa2f-41cf-b666-f9a2ac3118c1",
-            #     "47538f05-32e3-4594-80e2-0b3b33257ae7"
+            #     uuid.UUID("905993ea-f50e-4284-bea0-5be3f0ed7031"),
+            #     uuid.UUID("9406b631-fa2f-41cf-b666-f9a2ac3118c1"),
+            #     uuid.UUID("47538f05-32e3-4594-80e2-0b3b33257ae7")
             # ],

From bdfa30e409339292bfe7620f08750a1d43da8d64 Mon Sep 17 00:00:00 2001
From: William Shin
Date: Wed, 18 Dec 2024 10:11:48 -0800
Subject: [PATCH 18/19] Update test_expect_column_values_to_be_in_type_list.py

---
 ...expect_column_values_to_be_in_type_list.py | 162 ------------------
 1 file changed, 162 deletions(-)

diff --git a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
index 954e7b5032b0..c893f71327b9 100644
--- a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
+++ b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py
@@ -3,7 +3,6 @@
 import sqlalchemy.types as sqltypes

 import great_expectations.expectations as gxe
-from great_expectations.compatibility.databricks import DATABRICKS_TYPES
 from great_expectations.compatibility.snowflake import SNOWFLAKE_TYPES
 from great_expectations.core.result_format import ResultFormat
 from great_expectations.datasource.fluent.interfaces import Batch
@@ -380,164 +379,3 @@ def test_success_complete_snowflake(
     assert isinstance(result_dict["observed_value"], str)
     assert isinstance(expectation.type_list, list)
     assert result_dict["observed_value"] in expectation.type_list
-
-
-@pytest.mark.parametrize(
-    "expectation",
-    [
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(column="STRING", type_list=["STRING"]),
-            id="STRING",
-        ),
-        # SqlA Text gets converted to Databricks STRING
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(column="TEXT", type_list=["STRING"]),
-            id="TEXT",
-        ),
-        # SqlA UNICODE gets converted to Databricks STRING
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(column="UNICODE", type_list=["STRING"]),
-            id="UNICODE",
-        ),
-        # SqlA UNICODE_TEXT gets converted to Databricks STRING
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(column="UNICODE_TEXT", type_list=["STRING"]),
-            id="UNICODE_TEXT",
-        ),
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(column="BOOLEAN", type_list=["BOOLEAN"]),
-            id="BOOLEAN",
-        ),
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(
-                column="DECIMAL", type_list=["DECIMAL", "DECIMAL(10, 0)"]
-            ),
-            id="DECIMAL",
-        ),
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(column="DATE", type_list=["DATE"]),
-            id="DATE",
-        ),
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(column="TIMESTAMP", type_list=["TIMESTAMP"]),
-            id="TIMESTAMP",
-        ),
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(
-                column="TIMESTAMP_NTZ", type_list=["TIMESTAMP_NTZ"]
-            ),
-            id="TIMESTAMP_NTZ",
-        ),
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(column="DOUBLE", type_list=["DOUBLE", "FLOAT"]),
-            id="DOUBLE",
-        ),
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(column="FLOAT", type_list=["FLOAT"]),
-            id="FLOAT",
-        ),
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(column="INT", type_list=["INT"]),
-            id="INT",
-        ),
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(column="TINYINT", type_list=["TINYINT"]),
-            id="TINYINT",
-        ),
-        pytest.param(
-            gxe.ExpectColumnValuesToBeInTypeList(
-                column="DECIMAL", type_list=["DECIMAL", "DECIMAL(10, 0)"]
-            ),
-            id="DECIMAL",
-        ),
-        # SqlA Time gets converted to Databricks STRING,
-        # but is not supported by our testing framework
-        # pytest.param(
-        #     gxe.ExpectColumnValuesToBeInTypeList(column="TIME", type_list=["STRING"]),
-        #     id="TIME",
-        # ),
-        # SqlA UUID gets converted to Databricks STRING,
-        # but is not supported by our testing framework.
-        # pytest.param(
-        #     gxe.ExpectColumnValuesToBeInTypeList(column="UUID", type_list=["STRING"]),
-        #     id="UUID",
-        # )
-    ],
-)
-@parameterize_batch_for_data_sources(
-    data_source_configs=[
-        DatabricksDatasourceTestConfig(
-            column_types={
-                "STRING": DATABRICKS_TYPES.STRING,
-                "TEXT": sqltypes.Text,
-                "UNICODE": sqltypes.Unicode,
-                "UNICODE_TEXT": sqltypes.UnicodeText,
-                "BIGINT": sqltypes.BigInteger,
-                "BOOLEAN": sqltypes.BOOLEAN,
-                "DATE": sqltypes.DATE,
-                "TIMESTAMP_NTZ": DATABRICKS_TYPES.TIMESTAMP_NTZ,
-                "TIMESTAMP": DATABRICKS_TYPES.TIMESTAMP,
-                "DOUBLE": sqltypes.DOUBLE,
-                "FLOAT": sqltypes.FLOAT,
-                "INT": sqltypes.Integer,
-                "DECIMAL": sqltypes.Numeric,
-                "SMALLINT": sqltypes.SmallInteger,
-                "TINYINT": DATABRICKS_TYPES.TINYINT,
-                # "TIME": sqltypes.Time,
-                # "UUID": sqltypes.UUID,
-            }
-        )
-    ],
-    data=pd.DataFrame(
-        {
-            "STRING": ["a", "b", "c"],
-            "TEXT": ["a", "b", "c"],
-            "UNICODE": ["\u00e9", "\u00e9", "\u00e9"],
-            "UNICODE_TEXT": ["a", "b", "c"],
-            "BIGINT": [1111, 2222, 3333],
-            "BOOLEAN": [True, True, False],
-            "DATE": [
-                "2021-01-01",
-                "2021-01-02",
-                "2021-01-03",
-            ],
-            "TIMESTAMP_NTZ": [
-                "2021-01-01 00:00:00",
-                "2021-01-02 00:00:00",
-                "2021-01-03 00:00:00",
-            ],
-            "TIMESTAMP": [
-                "2021-01-01 00:00:00",
-                "2021-01-02 00:00:00",
-                "2021-01-03 00:00:00",
-            ],
-            "DOUBLE": [1.0, 2.0, 3.0],
-            "FLOAT": [1.0, 2.0, 3.0],
-            "INT": [1, 2, 3],
-            "DECIMAL": [1.1, 2.2, 3.3],
-            "SMALLINT": [1, 2, 3],
-            # "TIME": [
-            #     sa.Time("22:17:33.123456"),
-            #     sa.Time("22:17:33.123456"),
-            #     sa.Time("22:17:33.123456"),
-            # ],
-            # "UUID": [
-            #     uuid.UUID("905993ea-f50e-4284-bea0-5be3f0ed7031"),
-            #     uuid.UUID("9406b631-fa2f-41cf-b666-f9a2ac3118c1"),
-            #     uuid.UUID("47538f05-32e3-4594-80e2-0b3b33257ae7")
-            # ],
-        },
-        dtype="object",
-    ),
-)
-def test_success_complete_databricks(
-    batch_for_datasource: Batch, expectation: gxe.ExpectColumnValuesToBeInTypeList
-) -> None:
-    result = batch_for_datasource.validate(expectation, result_format=ResultFormat.COMPLETE)
-    result_dict = result.to_json_dict()["result"]
-
-    assert result.success
-    assert isinstance(result_dict, dict)
-    assert isinstance(result_dict["observed_value"], str)
-    assert isinstance(expectation.type_list, list)
-    assert result_dict["observed_value"] in expectation.type_list

From 452500381778c30826f06827fd94aa8444f43769 Mon Sep 17 00:00:00 2001
From: William Shin
Date: Wed, 18 Dec 2024 10:16:44 -0800
Subject: [PATCH 19/19] only the needed changes.

---
 .../expect_column_values_to_be_in_type_list.py |  2 +-
 .../core/expect_column_values_to_be_of_type.py |  2 +-
 great_expectations/expectations/metrics/util.py | 15 ++++++---------
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py b/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py
index e0bd10f36942..0f9d82e42771 100644
--- a/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py
+++ b/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py
@@ -458,7 +458,7 @@ def _validate_pandas(  # noqa: C901, PLR0912
     def _validate_sqlalchemy(self, actual_column_type, expected_types_list, execution_engine):
         if expected_types_list is None:
             success = True
-        elif execution_engine.dialect_name in [GXSqlDialect.SNOWFLAKE, GXSqlDialect.DATABRICKS]:
+        elif execution_engine.dialect_name == GXSqlDialect.SNOWFLAKE:
             success = isinstance(actual_column_type, str) and any(
                 actual_column_type.lower() == expected_type.lower()
                 for expected_type in expected_types_list

diff --git a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py
index 647053760d6d..5429c3979882 100644
--- a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py
+++ b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py
@@ -412,7 +412,7 @@ def _validate_sqlalchemy(self, actual_column_type, expected_type, execution_engi

         if expected_type is None:
             success = True
-        elif execution_engine.dialect_name in [GXSqlDialect.SNOWFLAKE, GXSqlDialect.DATABRICKS]:
+        elif execution_engine.dialect_name == GXSqlDialect.SNOWFLAKE:
             success = (
                 isinstance(actual_column_type, str)
                 and actual_column_type.lower() == expected_type.lower()

diff --git a/great_expectations/expectations/metrics/util.py b/great_expectations/expectations/metrics/util.py
index c89f2e870477..c3dc274bfc94 100644
--- a/great_expectations/expectations/metrics/util.py
+++ b/great_expectations/expectations/metrics/util.py
@@ -414,19 +414,16 @@ def get_sqlalchemy_column_metadata(  # noqa: C901
         )

         dialect_name = execution_engine.dialect.name
-        if dialect_name in [GXSqlDialect.SNOWFLAKE, GXSqlDialect.DATABRICKS]:
+        if dialect_name == GXSqlDialect.SNOWFLAKE:
             # WARNING: Do not alter columns in place, as they are cached on the inspector
             columns_copy = [column.copy() for column in columns]
             for column in columns_copy:
                 column["type"] = column["type"].compile(dialect=execution_engine.dialect)
-            if dialect_name == GXSqlDialect.SNOWFLAKE:
-                return [
-                    # TODO: SmartColumn should know the dialect and do lookups based on that
-                    CaseInsensitiveNameDict(column)
-                    for column in columns_copy
-                ]
-            else:
-                return columns_copy
+            return [
+                # TODO: SmartColumn should know the dialect and do lookups based on that
+                CaseInsensitiveNameDict(column)
+                for column in columns_copy
+            ]

         return columns
     except AttributeError as e:
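
Two follow-up notes on what the series converges to.

First, since PATCH 18/19 and PATCH 19/19 revert the test and expectation changes, the net effect of the series is the compatibility shim in great_expectations/compatibility/databricks.py: each dialect type imports normally when the driver is present and degrades to a NotImported sentinel when it is not, so importing the module never raises. Below is a minimal, self-contained sketch of that pattern. One caveat: NotImported is defined in great_expectations/compatibility/not_imported.py, which these diffs never show, so the stand-in class here is an assumption about its behavior (a placeholder that only errors when actually used), not the library's real implementation.

from __future__ import annotations


class NotImported:
    """Assumed stand-in: a placeholder that raises only on first use."""

    def __init__(self, message: str) -> None:
        self._message = message

    def __getattr__(self, name: str):
        # Any attribute access on the placeholder reports the missing package.
        raise ModuleNotFoundError(self._message)

    def __call__(self, *args, **kwargs):
        raise ModuleNotFoundError(self._message)


DATABRICKS_CONNECT_NOT_IMPORTED = NotImported(
    "databricks-connect is not installed, please 'pip install databricks-connect'"
)

# Mirrors the final shape of the compatibility module: import the dialect
# type if available, otherwise degrade to the sentinel.
try:
    from databricks.sqlalchemy._types import TIMESTAMP_NTZ
except (ImportError, AttributeError):
    TIMESTAMP_NTZ = DATABRICKS_CONNECT_NOT_IMPORTED  # type: ignore[misc, assignment]


class DATABRICKS_TYPES:
    """Namespace for Databricks dialect types (subset, for illustration)."""

    TIMESTAMP_NTZ = TIMESTAMP_NTZ

The payoff of a sentinel over a plain None is the failure mode: code that never touches Databricks types runs untouched, while the first real use fails with the actionable install hint.

Second, PATCH 11/19 ("the fix") is worth understanding even though PATCH 19/19 narrows it back to Snowflake: it compiles each reflected column type to its dialect string and then compares type names case-insensitively against the expectation's type_list. The sketch below isolates that comparison. The standalone function name type_in_type_list is hypothetical; in the diffs this logic lives inside _validate_sqlalchemy on the expectation classes.

from typing import List, Optional


def type_in_type_list(actual_column_type: str, expected_types_list: Optional[List[str]]) -> bool:
    # None means "no constraint", mirroring _validate_sqlalchemy in the diffs.
    if expected_types_list is None:
        return True
    # String-compiled types (e.g. "STRING", "TIMESTAMP_NTZ") match by name,
    # ignoring case, against the expectation's type_list.
    return any(
        actual_column_type.lower() == expected_type.lower()
        for expected_type in expected_types_list
    )


assert type_in_type_list("string", ["STRING"])
assert type_in_type_list("VARCHAR(16777216)", ["STRING", "VARCHAR(16777216)"])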