From ab0a7f7ce6384f1693a2e496011f81900ab38472 Mon Sep 17 00:00:00 2001 From: alena-hutchinson Date: Thu, 19 Dec 2024 09:13:18 -0800 Subject: [PATCH] [BUGFIX] Metric `table.column_type` should properly evaluate for Postgres (#10793) Co-authored-by: Thu Pham Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../schema_strict_and_relaxed.py | 4 +- .../compatibility/postgresql.py | 78 +++++++++++++ ...expect_column_values_to_be_in_type_list.py | 6 +- .../expect_column_values_to_be_of_type.py | 6 +- .../expectations/metrics/util.py | 13 ++- ...expect_column_values_to_be_in_type_list.py | 106 ++++++++++++++++++ 6 files changed, 206 insertions(+), 7 deletions(-) create mode 100644 great_expectations/compatibility/postgresql.py diff --git a/docs/docusaurus/docs/reference/learn/data_quality_use_cases/schema_resources/schema_strict_and_relaxed.py b/docs/docusaurus/docs/reference/learn/data_quality_use_cases/schema_resources/schema_strict_and_relaxed.py index e1d8f3669ff6..49fcf65e4a39 100644 --- a/docs/docusaurus/docs/reference/learn/data_quality_use_cases/schema_resources/schema_strict_and_relaxed.py +++ b/docs/docusaurus/docs/reference/learn/data_quality_use_cases/schema_resources/schema_strict_and_relaxed.py @@ -63,7 +63,7 @@ ) strict_suite.add_expectation( - gxe.ExpectColumnValuesToBeOfType(column="transfer_amount", type_="DOUBLE_PRECISION") + gxe.ExpectColumnValuesToBeOfType(column="transfer_amount", type_="DOUBLE PRECISION") ) strict_results = batch.validate(strict_suite) @@ -85,7 +85,7 @@ relaxed_suite.add_expectation( gxe.ExpectColumnValuesToBeInTypeList( - column="transfer_amount", type_list=["DOUBLE_PRECISION", "STRING"] + column="transfer_amount", type_list=["DOUBLE PRECISION", "STRING"] ) ) diff --git a/great_expectations/compatibility/postgresql.py b/great_expectations/compatibility/postgresql.py new file mode 100644 index 000000000000..492b73c5b502 --- /dev/null +++ b/great_expectations/compatibility/postgresql.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from great_expectations.compatibility.not_imported import NotImported + +POSTGRESQL_NOT_IMPORTED = NotImported( + "postgresql connection components are not installed, please 'pip install psycopg2'" +) + +try: + import psycopg2 # noqa: F401 + import sqlalchemy.dialects.postgresql as postgresqltypes +except ImportError: + postgresqltypes = POSTGRESQL_NOT_IMPORTED # type: ignore[assignment] + +try: + from sqlalchemy.dialects.postgresql import TEXT +except (ImportError, AttributeError): + TEXT = POSTGRESQL_NOT_IMPORTED # type: ignore[misc, assignment] + +try: + from sqlalchemy.dialects.postgresql import CHAR +except (ImportError, AttributeError): + CHAR = POSTGRESQL_NOT_IMPORTED # type: ignore[misc, assignment] + +try: + from sqlalchemy.dialects.postgresql import INTEGER +except (ImportError, AttributeError): + INTEGER = POSTGRESQL_NOT_IMPORTED # type: ignore[misc, assignment] + +try: + from sqlalchemy.dialects.postgresql import SMALLINT +except (ImportError, AttributeError): + SMALLINT = POSTGRESQL_NOT_IMPORTED # type: ignore[misc, assignment] + +try: + from sqlalchemy.dialects.postgresql import BIGINT +except (ImportError, AttributeError): + BIGINT = POSTGRESQL_NOT_IMPORTED # type: ignore[misc, assignment] + +try: + from sqlalchemy.dialects.postgresql import TIMESTAMP +except (ImportError, AttributeError): + TIMESTAMP = POSTGRESQL_NOT_IMPORTED # type: ignore[misc, assignment] + +try: + from sqlalchemy.dialects.postgresql import DATE +except (ImportError, AttributeError): + DATE = POSTGRESQL_NOT_IMPORTED # type: ignore[misc, assignment] + +try: + from sqlalchemy.dialects.postgresql import DOUBLE_PRECISION +except (ImportError, AttributeError): + DOUBLE_PRECISION = POSTGRESQL_NOT_IMPORTED # type: ignore[misc, assignment] + +try: + from sqlalchemy.dialects.postgresql import BOOLEAN +except (ImportError, AttributeError): + BOOLEAN = POSTGRESQL_NOT_IMPORTED # type: ignore[misc, assignment] + +try: + from sqlalchemy.dialects.postgresql import NUMERIC +except (ImportError, AttributeError): + NUMERIC = POSTGRESQL_NOT_IMPORTED # type: ignore[misc, assignment] + + +class POSTGRESQL_TYPES: + """Namespace for PostgreSQL dialect types.""" + + TEXT = TEXT + CHAR = CHAR + INTEGER = INTEGER + SMALLINT = SMALLINT + BIGINT = BIGINT + TIMESTAMP = TIMESTAMP + DATE = DATE + DOUBLE_PRECISION = DOUBLE_PRECISION + BOOLEAN = BOOLEAN + NUMERIC = NUMERIC diff --git a/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py b/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py index e0bd10f36942..0c582ec8ed2b 100644 --- a/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py +++ b/great_expectations/expectations/core/expect_column_values_to_be_in_type_list.py @@ -458,7 +458,11 @@ def _validate_pandas( # noqa: C901, PLR0912 def _validate_sqlalchemy(self, actual_column_type, expected_types_list, execution_engine): if expected_types_list is None: success = True - elif execution_engine.dialect_name in [GXSqlDialect.SNOWFLAKE, GXSqlDialect.DATABRICKS]: + elif execution_engine.dialect_name in [ + GXSqlDialect.DATABRICKS, + GXSqlDialect.POSTGRESQL, + GXSqlDialect.SNOWFLAKE, + ]: success = isinstance(actual_column_type, str) and any( actual_column_type.lower() == expected_type.lower() for expected_type in expected_types_list diff --git a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py index 647053760d6d..adcff692b3de 100644 --- a/great_expectations/expectations/core/expect_column_values_to_be_of_type.py +++ b/great_expectations/expectations/core/expect_column_values_to_be_of_type.py @@ -412,7 +412,11 @@ def _validate_sqlalchemy(self, actual_column_type, expected_type, execution_engi if expected_type is None: success = True - elif execution_engine.dialect_name in [GXSqlDialect.SNOWFLAKE, GXSqlDialect.DATABRICKS]: + elif execution_engine.dialect_name in [ + GXSqlDialect.DATABRICKS, + GXSqlDialect.POSTGRESQL, + GXSqlDialect.SNOWFLAKE, + ]: success = ( isinstance(actual_column_type, str) and actual_column_type.lower() == expected_type.lower() diff --git a/great_expectations/expectations/metrics/util.py b/great_expectations/expectations/metrics/util.py index c89f2e870477..8fe5e0d3f141 100644 --- a/great_expectations/expectations/metrics/util.py +++ b/great_expectations/expectations/metrics/util.py @@ -360,7 +360,7 @@ def __getitem__(self, key: Any) -> Any: return item -def get_sqlalchemy_column_metadata( # noqa: C901 +def get_sqlalchemy_column_metadata( # noqa: C901, PLR0912 execution_engine: SqlAlchemyExecutionEngine, table_selectable: sqlalchemy.Select, schema_name: Optional[str] = None, @@ -414,11 +414,18 @@ def get_sqlalchemy_column_metadata( # noqa: C901 ) dialect_name = execution_engine.dialect.name - if dialect_name in [GXSqlDialect.SNOWFLAKE, GXSqlDialect.DATABRICKS]: + if dialect_name in [ + GXSqlDialect.DATABRICKS, + GXSqlDialect.POSTGRESQL, + GXSqlDialect.SNOWFLAKE, + ]: # WARNING: Do not alter columns in place, as they are cached on the inspector columns_copy = [column.copy() for column in columns] for column in columns_copy: - column["type"] = column["type"].compile(dialect=execution_engine.dialect) + if column.get("type"): + # When using column_reflection_fallback, we might not be able to + # extract the column type, and only have the column name + column["type"] = column["type"].compile(dialect=execution_engine.dialect) if dialect_name == GXSqlDialect.SNOWFLAKE: return [ # TODO: SmartColumn should know the dialect and do lookups based on that diff --git a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py index 69bbbb675dc2..ce5629ded769 100644 --- a/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py +++ b/tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_in_type_list.py @@ -5,6 +5,7 @@ import great_expectations.expectations as gxe from great_expectations.compatibility.databricks import DATABRICKS_TYPES +from great_expectations.compatibility.postgresql import POSTGRESQL_TYPES from great_expectations.compatibility.snowflake import SNOWFLAKE_TYPES from great_expectations.compatibility.sqlalchemy import ( sqlalchemy as sa, @@ -19,6 +20,7 @@ from tests.integration.test_utils.data_source_config import ( DatabricksDatasourceTestConfig, PandasDataFrameDatasourceTestConfig, + PostgreSQLDatasourceTestConfig, SnowflakeDatasourceTestConfig, ) @@ -386,6 +388,110 @@ def test_success_complete_snowflake( assert result_dict["observed_value"] in expectation.type_list +@pytest.mark.parametrize( + "expectation", + [ + pytest.param( + gxe.ExpectColumnValuesToBeInTypeList(column="CHAR", type_list=["CHAR", "CHAR(1)"]), + id="CHAR", + ), + pytest.param( + gxe.ExpectColumnValuesToBeInTypeList(column="TEXT", type_list=["TEXT"]), + id="TEXT", + ), + pytest.param( + gxe.ExpectColumnValuesToBeInTypeList(column="INTEGER", type_list=["INTEGER"]), + id="INTEGER", + ), + pytest.param( + gxe.ExpectColumnValuesToBeInTypeList(column="SMALLINT", type_list=["SMALLINT"]), + id="SMALLINT", + ), + pytest.param( + gxe.ExpectColumnValuesToBeInTypeList(column="BIGINT", type_list=["BIGINT"]), + id="BIGINT", + ), + pytest.param( + gxe.ExpectColumnValuesToBeInTypeList( + column="TIMESTAMP", type_list=["TIMESTAMP", "TIMESTAMP WITHOUT TIME ZONE"] + ), + id="TIMESTAMP", + ), + pytest.param( + gxe.ExpectColumnValuesToBeInTypeList(column="DATE", type_list=["DATE"]), + id="DATE", + ), + pytest.param( + gxe.ExpectColumnValuesToBeInTypeList( + column="DOUBLE_PRECISION", type_list=["DOUBLE PRECISION"] + ), + id="DOUBLE_PRECISION", + ), + pytest.param( + gxe.ExpectColumnValuesToBeInTypeList(column="BOOLEAN", type_list=["BOOLEAN"]), + id="BOOLEAN", + ), + pytest.param( + gxe.ExpectColumnValuesToBeInTypeList(column="NUMERIC", type_list=["NUMERIC"]), + id="NUMERIC", + ), + ], +) +@parameterize_batch_for_data_sources( + data_source_configs=[ + PostgreSQLDatasourceTestConfig( + column_types={ + "CHAR": POSTGRESQL_TYPES.CHAR, + "TEXT": POSTGRESQL_TYPES.TEXT, + "INTEGER": POSTGRESQL_TYPES.INTEGER, + "SMALLINT": POSTGRESQL_TYPES.SMALLINT, + "BIGINT": POSTGRESQL_TYPES.BIGINT, + "TIMESTAMP": POSTGRESQL_TYPES.TIMESTAMP, + "DATE": POSTGRESQL_TYPES.DATE, + "DOUBLE_PRECISION": POSTGRESQL_TYPES.DOUBLE_PRECISION, + "BOOLEAN": POSTGRESQL_TYPES.BOOLEAN, + "NUMERIC": POSTGRESQL_TYPES.NUMERIC, + } + ), + ], + data=pd.DataFrame( + { + "CHAR": ["a", "b", "c"], + "TEXT": ["a", "b", "c"], + "INTEGER": [1, 2, 3], + "SMALLINT": [1, 2, 3], + "BIGINT": [1, 2, 3], + "TIMESTAMP": [ + "2021-01-01 00:00:00", + "2021-01-02 00:00:00", + "2021-01-03 00:00:00", + ], + "DATE": [ + # Date in isoformat + "2021-01-01", + "2021-01-02", + "2021-01-03", + ], + "DOUBLE_PRECISION": [1.0, 2.0, 3.0], + "BOOLEAN": [False, False, True], + "NUMERIC": [1, 2, 3], + }, + dtype="object", + ), +) +def test_success_complete_postgres( + batch_for_datasource: Batch, expectation: gxe.ExpectColumnValuesToBeInTypeList +) -> None: + result = batch_for_datasource.validate(expectation, result_format=ResultFormat.COMPLETE) + result_dict = result.to_json_dict()["result"] + + assert result.success + assert isinstance(result_dict, dict) + assert isinstance(result_dict["observed_value"], str) + assert isinstance(expectation.type_list, list) + assert result_dict["observed_value"] in expectation.type_list + + @pytest.mark.parametrize( "expectation", [