From 8fdfd6b8454c5f592687746286a3a8c643de0174 Mon Sep 17 00:00:00 2001 From: Paul Lam Date: Mon, 9 Sep 2024 11:08:54 +0900 Subject: [PATCH] moved code snippets into CI --- .../learn/data_quality_use_cases/volume.md | 47 ++----- .../volume_resources/volume_expectations.py | 116 ++++++++++++++++++ 2 files changed, 123 insertions(+), 40 deletions(-) create mode 100644 docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py diff --git a/docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume.md b/docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume.md index eeb4e71e02f9..474c36362d76 100644 --- a/docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume.md +++ b/docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume.md @@ -46,11 +46,7 @@ Ensures that the number of rows in a dataset falls within a specified range. **Use Case**: Validate that daily transaction volumes are within expected bounds, alerting to unusual spikes or drops in activity. -```py -gxe.ExpectTableRowCountToBeBetween( - min_value=1000 - max_value=1500 -) +```python title="" name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py ExpectTableRowCountToBeBetween" ``` View `ExpectTableRowCountToBeBetween` in the [Expectation Gallery](https://greatexpectations.io/expectations/expect_table_row_count_to_be_between). @@ -62,10 +58,7 @@ Verifies that the dataset contains exactly the specified number of records. **Use Case**: Ensure that a specific number of records are processed, useful for batch operations or reconciliation tasks. 
-```py -gxe.ExpectTableRowCountToEqual( - value=300 -) +```python title="" name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py ExpectTableRowCountToEqual" ``` View `ExpectTableRowCountToEqual` in the [Expectation Gallery](https://greatexpectations.io/expectations/expect_table_row_count_to_equal). @@ -77,10 +70,7 @@ Compares the row count of the current table to another table within the same dat **Use Case**: Verify data consistency across different stages of a pipeline or between source and target systems. -```py -gxe.ExpectTableRowCountToEqualOtherTable( - other_table_name="transactions_summary" -) +```python title="" name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py ExpectTableRowCountToEqualOtherTable" ``` View `ExpectTableRowCountToEqualOtherTable` in the [Expectation Gallery](https://greatexpectations.io/expectations/expect_table_row_count_to_equal_other_table). @@ -100,10 +90,7 @@ gxe.ExpectTableRowCountToEqualOtherTable( **GX solution**: Implement checks to ensure data volume consistency between source and target systems in a data reconciliation process. -```python -gxe.ExpectTableRowCountToEqualOtherTable( - other_table_name="target_system_transactions" -) +```python title="" name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py reconciliation_across_systems" ``` ### Monitoring data volume in real-time streaming pipelines @@ -112,11 +99,7 @@ gxe.ExpectTableRowCountToEqualOtherTable( **GX solution**: Implement checks to monitor data volume in real-time streaming pipelines and alert when anomalies are detected. 
-```python -gxe.ExpectTableRowCountToBeBetween( - min_value=1000, - max_value=1500 -) +```python title="" name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py monitoring_streaming_pipelines" ``` ### Batch processing verification @@ -125,10 +108,7 @@ gxe.ExpectTableRowCountToBeBetween( **GX solution**: Validate that each processed batch contains exactly the expected number of records. -```python -gxe.ExpectTableRowCountToEqual( - value=300 -) +```python title="" name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py batch_processing_verification" ``` ## Avoid common volume validation pitfalls @@ -153,20 +133,7 @@ While volume management is a critical component of data quality, it's just one f 3. Develop a multifaceted approach that combines volume checks with other [crucial data quality aspects](/reference/learn/data_quality_use_cases/dq_use_cases_lp.md), such as data integrity, schema evolution, and distribution analysis. For instance, consider coupling volume checks with schema validation: -```python -gxe.ExpectTableRowCountToBeBetween( - min_value=1000 - max_value=1500 -) - -gxe.ExpectTableColumnsToMatchOrderedList( - column_list=[ - "sender_account_number", - "recipient_account_number", - "transfer_amount", - "transfer_date", - ] -) +```python title="" name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py combined_checks" ``` This combination allows you to monitor for unexpected data growth while simultaneously ensuring structural consistency, providing a more robust validation framework. 
diff --git a/docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py b/docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py new file mode 100644 index 000000000000..161933ca1519 --- /dev/null +++ b/docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py @@ -0,0 +1,116 @@ +""" +To run this test locally, use the postgresql database docker container. + +1. From the repo root dir, run: +cd assets/docker/postgresql +docker compose up + +2. Run the following command from the repo root dir in a second terminal: +pytest --postgresql --docs-tests -k "data_quality_use_case_volume_expectations" tests/integration/test_script_runner.py +""" + +# This section loads sample data to use for CI testing of the script. +import pathlib + +import great_expectations as gx +import great_expectations.expectations as gxe +from tests.test_utils import load_data_into_test_database + +CONNECTION_STRING = "postgresql+psycopg2://postgres:@localhost/test_ci" + +GX_ROOT_DIR = pathlib.Path(gx.__file__).parent.parent + +# Add test data to database for testing. +load_data_into_test_database( + table_name="transfers", + csv_path=str( + GX_ROOT_DIR / "tests/test_sets/learn_data_quality_use_cases/volume_financial_transfers.csv" + ), + connection_string=CONNECTION_STRING, +) + +context = gx.get_context() + +datasource = context.data_sources.add_postgres( + "postgres database", connection_string=CONNECTION_STRING +) + +data_asset = datasource.add_table_asset(name="data asset", table_name="transfers") +batch_definition = data_asset.add_batch_definition_whole_table("batch definition") +batch = batch_definition.get_batch() + +suite = context.suites.add(gx.ExpectationSuite(name="example missingness expectations")) + +############################# +# Start Expectation snippets. 
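+ +suite.add_expectation( + # <snippet name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py ExpectTableRowCountToBeBetween"> + gxe.ExpectTableRowCountToBeBetween( + min_value=2, + max_value=5 + ) + # </snippet> +) + +suite.add_expectation( + # <snippet name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py ExpectTableRowCountToEqual"> + gxe.ExpectTableRowCountToEqual( + value=4 + ) + # </snippet> +) + +suite.add_expectation( + # <snippet name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py ExpectTableRowCountToEqualOtherTable"> + gxe.ExpectTableRowCountToEqualOtherTable( + other_table_name="transactions_summary" + ) + # </snippet> +) + +suite.add_expectation( + # <snippet name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py reconciliation_across_systems"> + gxe.ExpectTableRowCountToEqualOtherTable( + other_table_name="target_system_transactions" + ) + # </snippet> +) + +suite.add_expectation( + # <snippet name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py monitoring_streaming_pipelines"> + gxe.ExpectTableRowCountToBeBetween( + min_value=2, + max_value=5 + ) + # </snippet> +) + +suite.add_expectation( + # <snippet name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py batch_processing_verification"> + gxe.ExpectTableRowCountToEqual( + value=4 + ) + # </snippet> +) + +# TODO fix this +# suite.add_expectation( +# # <snippet name="docs/docusaurus/docs/reference/learn/data_quality_use_cases/volume_resources/volume_expectations.py combined_checks"> +# gxe.ExpectTableRowCountToBeBetween( +# min_value=2, +# max_value=5 +# ), +# +# gxe.ExpectTableColumnsToMatchOrderedList( +# column_list=[ +# "sender_account_number", +# "recipient_account_number", +# "transfer_amount", +# "transfer_date", +# ] +# ) +# # </snippet> +# ) + +results = batch.validate(suite)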