diff --git a/docs/docusaurus/docs/components/_testing/test_data_sets/titantic_test_file/folder_with_data/titantic.csv b/docs/docusaurus/docs/components/_testing/test_data_sets/titantic_test_file/folder_with_data/titantic.csv new file mode 100644 index 000000000000..cf587e5bfea0 --- /dev/null +++ b/docs/docusaurus/docs/components/_testing/test_data_sets/titantic_test_file/folder_with_data/titantic.csv @@ -0,0 +1,22 @@ +"","Name","PClass","Age","Sex","Survived","SexCode" +"1","Allen, Miss Elisabeth Walton","1st",29,"female",1,1 +"2","Allison, Miss Helen Loraine","1st",2,"female",0,1 +"3","Allison, Mr Hudson Joshua Creighton","1st",30,"male",0,0 +"4","Allison, Mrs Hudson JC (Bessie Waldo Daniels)","1st",25,"female",0,1 +"5","Allison, Master Hudson Trevor","1st",0.92,"male",1,0 +"6","Anderson, Mr Harry","1st",47,"male",1,0 +"7","Andrews, Miss Kornelia Theodosia","1st",63,"female",1,1 +"8","Andrews, Mr Thomas, jr","1st",39,"male",0,0 +"358","Caldwell, Mrs Albert Francis (Sylvia Mae Harbaugh)","2nd",26,"female",1,1 +"359","Caldwell, Master Alden Gates","2nd",0.83,"male",1,0 +"360","Cameron, Miss Clear","2nd",31,"female",1,1 +"361","Campbell, Mr William","2nd",NA,"male",0,0 +"362","Carbines, Mr William","2nd",19,"male",0,0 +"363","Carter, Rev Ernest Courtenay","2nd",54,"male",0,0 +"364","Carter, Mrs Ernest Courtenay (Lillian Hughes)","2nd",44,"female",0,1 +"365","Chapman, Mr Charles Henry","2nd",52,"male",0,0 +"366","Chapman, Mr John Henry","2nd",30,"male",0,0 +"367","Chapman, Mrs John Henry (Elizabeth Lawry)","2nd",30,"female",0,1 +"368","Christy, Mrs Alice Frances","2nd",NA,"female",1,1 +"369","Christy, Miss Julie","2nd",NA,"female",1,1 + diff --git a/docs/docusaurus/docs/components/examples_under_test.py b/docs/docusaurus/docs/components/examples_under_test.py index 4255b764d59d..95cfad06801f 100644 --- a/docs/docusaurus/docs/components/examples_under_test.py +++ b/docs/docusaurus/docs/components/examples_under_test.py @@ -458,6 +458,36 @@ ), ] 
+docs_examples_customize_expectations = [ + IntegrationTestFixture( + # To test, run: + # pytest --docs-tests -k "docs_example_define_a_custom_expectation_class" tests/integration/test_script_runner.py + name="docs_example_define_a_custom_expectation_class", + user_flow_script="docs/docusaurus/docs/core/customize_expectations/_examples/define_a_custom_expectation_class.py", + data_dir="docs/docusaurus/docs/components/_testing/test_data_sets/single_test_file", + # data_context_dir="", + backend_dependencies=[], + ), + IntegrationTestFixture( + # To test, run: + # pytest --docs-tests -k "docs_example_expectation_row_conditions" tests/integration/test_script_runner.py + name="docs_example_expectation_row_conditions", + user_flow_script="docs/docusaurus/docs/core/customize_expectations/_examples/expectation_row_conditions.py", + data_dir="docs/docusaurus/docs/components/_testing/test_data_sets/titantic_test_file", + # data_context_dir="", + backend_dependencies=[], + ), + IntegrationTestFixture( + # To test, run: + # pytest --docs-tests -k "docs_example_use_sql_to_define_a_custom_expectation" tests/integration/test_script_runner.py + name="docs_example_use_sql_to_define_a_custom_expectation", + user_flow_script="docs/docusaurus/docs/core/customize_expectations/_examples/use_sql_to_define_a_custom_expectation.py", + data_dir="tests/test_sets/taxi_yellow_tripdata_samples/sqlite", + # data_context_dir="", + backend_dependencies=[], + ), +] + # Extend the docs_tests list with the above sublists (only the docs_tests list is imported # into `test_script_runner.py` and actually used in CI checks). 
@@ -473,6 +503,8 @@ docs_tests.extend(example_scripts_for_define_expectations) +docs_tests.extend(docs_examples_customize_expectations) + docs_tests.extend(docs_examples_trigger_actions_based_on_validation_results) docs_tests.extend(learn_data_quality_use_cases) diff --git a/docs/docusaurus/docs/core/customize_expectations/_examples/define_a_custom_expectation_class.py b/docs/docusaurus/docs/core/customize_expectations/_examples/define_a_custom_expectation_class.py new file mode 100644 index 000000000000..587390951dc2 --- /dev/null +++ b/docs/docusaurus/docs/core/customize_expectations/_examples/define_a_custom_expectation_class.py @@ -0,0 +1,74 @@ +""" +This is an example script for how to define a Custom Expectation class. + +To test, run: +pytest --docs-tests -k "docs_example_define_a_custom_expectation_class" tests/integration/test_script_runner.py +""" + + +def set_up_context_for_example(context): + # Create the Data Source + source_folder = "./data/folder_with_data" + data_source_name = "my_data_source" + data_source = context.data_sources.add_pandas_filesystem( + name=data_source_name, base_directory=source_folder + ) + assert data_source.name == data_source_name + + # Add a Data Asset + asset_name = "my_data_asset" + data_asset = data_source.add_csv_asset(name=asset_name) + assert data_asset.name == asset_name + + # Add a Batch Definition + batch_definition_name = "my_batch_definition" + batch_definition_regex = ( + r"yellow_tripdata_sample_(?P<year>\d{4})-(?P<month>\d{2})\.csv" + ) + batch_definition = data_asset.add_batch_definition_monthly( + name=batch_definition_name, regex=batch_definition_regex + ) + assert batch_definition.name == batch_definition_name + + +# EXAMPLE SCRIPT STARTS HERE: +# +import great_expectations as gx + +context = gx.get_context() +# Hide this +set_up_context_for_example(context) + + +# +# +# +class ExpectValidPassengerCount(gx.expectations.ExpectColumnValuesToBeBetween): + # + column: str = "passenger_count" + min_value: int = 1 + 
max_value: int = 6 + # + description: str = "There should be between **1** and **6** passengers." + + +# + +# Create an instance of the custom Expectation +# +expectation = ExpectValidPassengerCount() # Uses the predefined default values +# + +# Optional. Test the Expectation with some sample data +data_source_name = "my_data_source" +asset_name = "my_data_asset" +batch_definition_name = "my_batch_definition" +batch = ( + context.data_sources.get(data_source_name) + .get_asset(asset_name) + .get_batch_definition(batch_definition_name) + .get_batch() +) + +print(batch.validate(expectation)) +# diff --git a/docs/docusaurus/docs/core/customize_expectations/_examples/expectation_row_conditions.py b/docs/docusaurus/docs/core/customize_expectations/_examples/expectation_row_conditions.py new file mode 100644 index 000000000000..b5d32ab6025b --- /dev/null +++ b/docs/docusaurus/docs/core/customize_expectations/_examples/expectation_row_conditions.py @@ -0,0 +1,86 @@ +""" +This is an example script for how to use Expectation row conditions. 
+ +To test, run: +pytest --docs-tests -k "docs_example_expectation_row_conditions" tests/integration/test_script_runner.py +""" + + +def set_up_context_for_example(context): + # Create the Data Source + source_folder = "./data/folder_with_data" + data_source_name = "my_data_source" + data_source = context.data_sources.add_pandas_filesystem( + name=data_source_name, base_directory=source_folder + ) + assert data_source.name == data_source_name + + # Add a Data Asset + asset_name = "my_data_asset" + data_asset = data_source.add_csv_asset(name=asset_name) + assert data_asset.name == asset_name + + # Add a Batch Definition + batch_definition_name = "titantic_passengers" + batch_definition_path = "titantic.csv" + + batch_definition = data_asset.add_batch_definition_path( + name=batch_definition_name, path=batch_definition_path + ) + assert batch_definition.name == batch_definition_name + + +# EXAMPLE SCRIPT STARTS HERE: +# +import great_expectations as gx + +context = gx.get_context() +# Hide this +set_up_context_for_example(context) + +# Get a Batch for testing the Expectations: +data_source_name = "my_data_source" +data_asset_name = "my_data_asset" +batch_definition_name = "titantic_passengers" +batch = ( + context.data_sources.get(data_source_name) + .get_asset(data_asset_name) + .get_batch_definition(batch_definition_name) + .get_batch() +) + +# An unconditional Expectation is defined without the `row_condition` or `condition_parser` parameters: +# +expectation = gx.expectations.ExpectColumnValuesToBeInSet( + column="Survived", value_set=[0, 1] +) +# + +# Test the Expectation: +print(batch.validate(expectation)) + +# A Conditional Expectation for a pandas Data Source would be defined like this: +# +conditional_expectation = gx.expectations.ExpectColumnValuesToBeInSet( + column="Survived", + value_set=[1], + # + condition_parser="pandas", + row_condition='PClass=="1st"', + # +) +# + +# Test the Conditional Expectation: +print(batch.validate(conditional_expectation)) + 
+# A Conditional Expectation for a Spark or SQL Data Source would be defined like this: +conditional_expectation = gx.expectations.ExpectColumnValuesToBeInSet( + column="Survived", + value_set=[1], + # + condition_parser="spark", + row_condition='PClass=="1st"', + # +) +# diff --git a/docs/docusaurus/docs/core/customize_expectations/_examples/use_sql_to_define_a_custom_expectation.py b/docs/docusaurus/docs/core/customize_expectations/_examples/use_sql_to_define_a_custom_expectation.py new file mode 100644 index 000000000000..bff7f31826f0 --- /dev/null +++ b/docs/docusaurus/docs/core/customize_expectations/_examples/use_sql_to_define_a_custom_expectation.py @@ -0,0 +1,77 @@ +""" +This is an example script for how to use SQL to define a custom Expectation. + +To test, run: +pytest --docs-tests -k "docs_example_use_sql_to_define_a_custom_expectation" tests/integration/test_script_runner.py +""" + + +def set_up_context_for_example(context): + # Create the Data Source + connection_string = "sqlite:///data/yellow_tripdata.db" + data_source_name = "my_sql_data_source" + data_source = context.data_sources.add_sqlite( + name=data_source_name, connection_string=connection_string + ) + assert data_source.name == data_source_name + + # Add a Data Asset + asset_name = "my_data_asset" + database_table_name = "yellow_tripdata_sample_2019_01" + data_asset = data_source.add_table_asset( + table_name=database_table_name, name=asset_name + ) + assert data_asset.name == asset_name + + # Add a Batch Definition + batch_definition_name = "my_batch_definition" + batch_definition = data_asset.add_batch_definition_whole_table( + batch_definition_name + ) + assert batch_definition.name == batch_definition_name + + +# EXAMPLE SCRIPT STARTS HERE: +# +import great_expectations as gx + + +# Define a custom Expectation that uses SQL by subclassing UnexpectedRowsExpectation +# +# +# +class ExpectPassengerCountToBeLegal( + gx.expectations.expectation.UnexpectedRowsExpectation +): + # + 
unexpected_rows_query: str = ( + "SELECT * FROM {batch} WHERE passenger_count > 6 or passenger_count < 0" + ) + # + description: str = "There should be no more than **6** passengers." + + +# + +context = gx.get_context() +# Hide this +set_up_context_for_example(context) + +# Instantiate the custom Expectation +# +expectation = ExpectPassengerCountToBeLegal() +# + +# Test the Expectation +data_source_name = "my_sql_data_source" +data_asset_name = "my_data_asset" +batch_definition_name = "my_batch_definition" +batch = ( + context.data_sources.get(data_source_name) + .get_asset(data_asset_name) + .get_batch_definition(batch_definition_name) + .get_batch() +) + +batch.validate(expectation) +# diff --git a/docs/docusaurus/docs/core/customize_expectations/define_a_custom_expectation_class.md b/docs/docusaurus/docs/core/customize_expectations/define_a_custom_expectation_class.md index 083cfdc889da..a50052ef9759 100644 --- a/docs/docusaurus/docs/core/customize_expectations/define_a_custom_expectation_class.md +++ b/docs/docusaurus/docs/core/customize_expectations/define_a_custom_expectation_class.md @@ -28,23 +28,17 @@ Advantages of subclassing an Expectation and providing customized attributes rat -1. Choose and import a base Expectation class. +1. Choose a base Expectation class. You can customize any of the core Expectation classes in GX. You can view the available Expectations and their functionality in the [Expectation Gallery](https://greatexpectations.io/expectations). - In this example, `ExpectColumnValueToBeBetween` will be customized: - - ```python title="Python" - from great_expectations.expectations import ExpectColumnValueToBeBetween - ``` - + In this example, `ExpectColumnValuesToBeBetween` will be customized. 2. Create a new Expectation class that inherits the base Expectation class. The core Expectations in GX have names describing their functionality. 
When you create a customized Expectation class you can provide a class name that is more indicative of your specific use case: - ```python title="Python" - class ExpectValidPassengerCount(ExpectColumnValueToBeBetween): + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/define_a_custom_expectation_class.py - define a custom Expectation subclass" ``` 3. Override the Expectation's attributes with new default values. @@ -53,67 +47,30 @@ Advantages of subclassing an Expectation and providing customized attributes rat In this example, the default column for `ExpectValidPassengerCount` is set to `passenger_count` and the default value range for the column is defined as between `1` and `6`: - ```python title="Python" - class ExpectValidPassengerCount(ExpectColumnValueToBeBetween): - # highlight-start - column: str = "passenger_count" - min_value: int = 1 - max_value: int = 6 - # highlight-end + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/define_a_custom_expectation_class.py - define default attributes for a custom Expectation class" ``` -5. Customize the rendering of the new Expectation when displayed in Data Docs. +4. Customize the rendering of the new Expectation when displayed in Data Docs. - The `description` attribute contains the text describing the customized Expectation when your results are rendered into Data Docs. It can be set when an Expectation class is defined or edited as an attribute of an Expectation instance. You can format the `description` string with Markdown syntax: + The `description` attribute of a customized Expectation class contains the text describing the customized Expectation when its results are rendered into Data Docs. 
You can format the `description` string with Markdown syntax: - ```python title="Python" - class ExpectValidPassengerCount(ExpectColumnValueToBeBetween): - column: str = "passenger_count" - min_value: int = 1 - max_value: int = 6 - # highlight-start - description: str = "There should be between **1** and **6** passengers." - # highlight-end + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/define_a_custom_expectation_class.py - define description attribute for a cusom Expectation" ``` -6. Use the customized subclass as an Expectation. +5. Use the customized subclass as an Expectation. - Once a customized Expectation subclass has been defined, instances of it can be created, added to Expectation Suites, and validated just like any other Expectation class: + It is best not to overwrite the predefined default values by passing in parameters when a customized Expectation is created. This ensures that the `description` remains accurate to the values that the customized Expectation uses. It also allows you to update all instances of the customized Expectation by editing the default values in the customized Expectation's class definition rather than having to update each instance individually in their Expectation Suites: - ```python title="Python" - expectation1 = ExpectValidPassengerCount() # Uses the predefined default values - expectation2 = ExpectValidPassengerCount(column="occupied_seats") # Uses a different column than the default, but keeps the default min_value, max_value, and description. + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/define_a_custom_expectation_class.py - instantiate a Custom Expectation" ``` - - It is best to use the predefined default values when a customized Expectation is created. This ensures that the `description` remains accurate to the values that the Expectation uses. 
It also allows you to update all instances of the customized Expectation by editing the default values in the customized Expectation's class definition rather than having to update each instance individually in their Expectation Suites. + + A customized Expectation instance can be added to Expectation Suites and validated just like any other Expectation. -```python title="Python" -import great_expectations as gx -from great_expectations.expectations import ExpectColumnValueToBeBetween - -class ExpectValidPassengerCount(ExpectColumnValueToBeBetween): - column: str = "passenger_count" - min_value: int = 1 - max_value: int = 6 - description: str = "There should be between **1** and **6** passengers." - -context = gx.get_context() - -expectation1 = ExpectValidPassengerCount() # Uses the predefined default values -expectation2 = ExpectValidPassengerCount(column="occupied_seats") # Uses a different column than the default, but keeps the default min_value, max_value, and description. - -data_source_name = "my_taxi_data" -asset_name = "2018_taxi_data" -batch_definition_name = "all_records_in_asset" -batch = context.get_datasource(datasource_name).get_asset(asset_name).get_batch_definition(batch_definition_name=batch_definition_name).get_batch() - -batch.validate(expectation1) -batch.validate(expectation2) - +```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/define_a_custom_expectation_class.py - full code example" ``` diff --git a/docs/docusaurus/docs/core/customize_expectations/expectation_row_conditions.md b/docs/docusaurus/docs/core/customize_expectations/expectation_row_conditions.md index ef21874893fc..4dc5c81cb05d 100644 --- a/docs/docusaurus/docs/core/customize_expectations/expectation_row_conditions.md +++ b/docs/docusaurus/docs/core/customize_expectations/expectation_row_conditions.md @@ -75,16 +75,14 @@ The data used in the examples for this procedure is passenger data for the Titan - ```python title="Python" - 
row_condition = "PClass=='1st'" + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/expectation_row_conditions.py - pandas example row_condition" ``` - ```python title="Python" - row_condition = "PClass=='1st'" + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/expectation_row_conditions.py - spark example row_condition" ``` @@ -150,24 +148,14 @@ The data used in the examples for this procedure is passenger data for the Titan Expectations with different conditions are treated as unique even if they are of the same type and apply to the same column within an Expectation Suite. This allows you to create one unconditional Expectation and an arbitrary number of Conditional Expectations (each with a different condition). - For example, the following code creates a unconditional Expectation that the value of the `"Suvived"` column is either 0 or 1, and a Conditional Expectation that the value of the `"Survived"` column is `1` if the individual was a first class passenger: + For example, the following code creates an unconditional Expectation that the value of the `"Survived"` column is either 0 or 1: - ```python title="Python" - expectation = suite.add_expectation( - gxe.ExpectColumnValuesToBeInSet( - column="Survived", - value_set=[0, 1] - ) - ) + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/expectation_row_conditions.py - example unconditional Expectation" + ``` - conditional_expectation = suite.add_expectation( - gxe.ExpectColumnValuesToBeInSet( - column='Survived', - value_set=[1], - condition_parser='pandas', - row_condition='PClass=="1st"' - ) - ) + And this code creates a Conditional version of the same Expectation that specifies that the value of the `"Survived"` column is `1` if the individual was a first-class passenger: + + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/expectation_row_conditions.py 
- example conditional Expectation" ``` diff --git a/docs/docusaurus/docs/core/customize_expectations/use_sql_to_define_a_custom_expectation.md b/docs/docusaurus/docs/core/customize_expectations/use_sql_to_define_a_custom_expectation.md index 49175bdc771c..80f02e4d1b0a 100644 --- a/docs/docusaurus/docs/core/customize_expectations/use_sql_to_define_a_custom_expectation.md +++ b/docs/docusaurus/docs/core/customize_expectations/use_sql_to_define_a_custom_expectation.md @@ -28,79 +28,43 @@ You customize an `UnexpectedRowsExpectation` in essentially the same manner as y -1. Import the `UnexpectedRowsExpectation` class: - - ```python title="Python" - from great_expectations.expectations import UnexpectedRowsExpectation - ``` - -2. Create a new Expectation class that inherits the `UnexpectedRowsExpectation` class. +1. Create a new Expectation class that inherits the `UnexpectedRowsExpectation` class. The class name `UnexpectedRowsExpectation` describes the functionality of the Expectation: it finds rows with unexpected values. When you create a customized Expectation class you can provide a class name that is more indicative of your specific use case. In this example, the customized subclass of `UnexpectedRowsExpectation` will be used to find invalid passenger counts in taxi trip data: - ```python title="Python" - class ExpectPassengerCountToBeLegal(UnexpectedRowsExpectation): + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/use_sql_to_define_a_custom_expectation.py - define a more descriptive name for an UnexpectedRowsExpectation" ``` -3. Override the Expectation's `unexpected_rows_query` attribute. +2. Override the Expectation's `unexpected_rows_query` attribute. The `unexpected_rows_query` attribute is a SQL or Spark-SQL query that returns a selection of rows from the Batch of data being validated. By default, rows that are returned have failed the validation check. 
Although the `unexpected_rows_query` should be written in standard SQL or Spark-SQL syntax, it must also contain the special `{batch}` placeholder. When the Expectation is evaluated, the `{batch}` placeholder will be replaced with the Batch of data that is validated. - In this example, `unexpected_rows_query` will select any rows where the passenger count is greater than `6`. These rows will fail validation for this Expectation: + In this example, `unexpected_rows_query` will select any rows where the passenger count is greater than `6` or less than `0`. These rows will fail validation for this Expectation: - ```python title="Python" - class ExpectPassengerCountToBeLegal(UnexpectedRowsExpectation): - # highlight-start - unexpected_rows_query: str = "SELECT * FROM {batch} WHERE passenger_count > 6" - # highlight-end + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/use_sql_to_define_a_custom_expectation.py - define the query for an UnexpectedRowsExpectation" ``` -5. Customize the rendering of the new Expectation when displayed in Data Docs. +3. Customize the rendering of the new Expectation when displayed in Data Docs. As with other Expectations, the `description` attribute contains the text describing the customized Expectation when your results are rendered into Data Docs. It can be set when an Expectation class is defined or edited as an attribute of an Expectation instance. You can format the `description` string with Markdown syntax: - ```python title="Python" - class ExpectPassengerCountToBeLegal(UnexpectedRowsExpectation): - column: str = "passenger_count" - unexpected_rows_query: str = "SELECT * FROM {batch} WHERE passenger_count > 6" - # highlight-start - description: str = "There should be no more than **6** passengers." 
- # highlight-end + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/use_sql_to_define_a_custom_expectation.py - define a custom UnexpectedRowsExpectation" ``` -6. Use the customized subclass as an Expectation. +4. Use the customized subclass as an Expectation. Once the customized Expectation subclass has been defined, instances of it can be created, added to Expectation Suites, and validated just like any other Expectation class: - ```python title="Python" - expectation = ExpectPassengerCountToBeLegal() + ```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/use_sql_to_define_a_custom_expectation.py - instantiate the custom SQL Expectation" ``` -```python title="Python" -import great_expectations as gx -from great_expectations.expectations import UnexpectedRowsExpectation - -class ExpectPassengerCountToBeLegal(UnexpectedRowsExpectation): - unexpected_rows_query: str = "SELECT * FROM {batch} WHERE passenger_count > 6" - description: str = "There should be no more than **6** passengers." - -context = gx.get_context() - -expectation = ExpectPassengerCountToBeLegal() # Uses the predefined default values - -data_source_name = "my_taxi_data" -asset_name = "2018_taxi_data" -batch_definition_name = "all_records_in_asset" -batch = context.get_datasource(datasource_name).get_asset(asset_name).get_batch_definition(batch_definition_name=batch_definition_name).get_batch() - -batch.validate(expectation) - +```python title="Python" name="docs/docusaurus/docs/core/customize_expectations/_examples/use_sql_to_define_a_custom_expectation.py - full code example" ```