diff --git a/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/.gitignore b/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/.gitignore new file mode 100644 index 000000000000..f34131a74439 --- /dev/null +++ b/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/.gitignore @@ -0,0 +1,2 @@ + +uncommitted/ \ No newline at end of file diff --git a/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/expectations/.ge_store_backend_id b/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/expectations/.ge_store_backend_id new file mode 100644 index 000000000000..c3d3c26efac6 --- /dev/null +++ b/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/expectations/.ge_store_backend_id @@ -0,0 +1 @@ +store_backend_id = ed62c7bd-023a-4b27-bd49-e1f9bd825c43 diff --git a/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/great_expectations.yml b/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/great_expectations.yml new file mode 100644 index 000000000000..c70f52d609bd --- /dev/null +++ b/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/great_expectations.yml @@ -0,0 +1,91 @@ +# Welcome to Great Expectations! Always know what to expect from your data. +# +# Here you can define datasources, batch kwargs generators, integrations and +# more. This file is intended to be committed to your repo. For help with +# configuration please: +# - Read our docs: https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/connect_to_data_overview/#2-configure-your-datasource +# - Join our slack channel: http://greatexpectations.io/slack + +# config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility +# It is auto-generated and usually does not need to be changed. +config_version: 4.0 + +# This config file supports variable substitution which enables: 1) keeping +# secrets out of source control & 2) environment-based configuration changes +# such as staging vs prod. +# +# When GX encounters substitution syntax (like `my_key: ${my_value}` or +# `my_key: $my_value`) in the great_expectations.yml file, it will attempt +# to replace the value of `my_key` with the value from an environment +# variable `my_value` or a corresponding key read from this config file, +# which is defined through the `config_variables_file_path`. +# Environment variables take precedence over variables defined here. +# +# Substitution values defined here can be a simple (non-nested) value, +# nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR}) +# +# +# https://docs.greatexpectations.io/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials + +config_variables_file_path: uncommitted/config_variables.yml + +# The plugins_directory will be added to your python path for custom modules +# used to override and extend Great Expectations. +plugins_directory: plugins/ + +stores: + # Stores are configurable places to store things like Expectations, Validations + # Data Docs, and more. These are for advanced users only - most users can simply + # leave this section alone. 
+ # + # Three stores are required: expectations, validations, and + # suite_parameters, and must exist with a valid store entry. Additional + # stores can be configured for uses such as data_docs, etc. + expectations_store: + class_name: ExpectationsStore + store_backend: + class_name: TupleFilesystemStoreBackend + base_directory: expectations/ + + validation_results_store: + class_name: ValidationResultsStore + store_backend: + class_name: TupleFilesystemStoreBackend + base_directory: uncommitted/validations/ + + checkpoint_store: + class_name: CheckpointStore + store_backend: + class_name: TupleFilesystemStoreBackend + suppress_store_backend_id: true + base_directory: checkpoints/ + + validation_definition_store: + class_name: ValidationDefinitionStore + store_backend: + class_name: TupleFilesystemStoreBackend + base_directory: validation_definitions/ + +expectations_store_name: expectations_store +validation_results_store_name: validation_results_store +checkpoint_store_name: checkpoint_store + +data_docs_sites: + # Data Docs make it simple to visualize data quality in your project. These + # include Expectations, Validations & Profiles. The are built for all + # Datasources from JSON artifacts in the local repo including validations & + # profiles from the uncommitted directory. Read more at https://docs.greatexpectations.io/docs/terms/data_docs + local_site: + class_name: SiteBuilder + show_how_to_buttons: true + store_backend: + class_name: TupleFilesystemStoreBackend + base_directory: uncommitted/data_docs/local_site/ + site_index_builder: + class_name: DefaultSiteIndexBuilder + +analytics_enabled: true +fluent_datasources: + my_dataframe_data_source: + type: pandas +data_context_id: ed62c7bd-023a-4b27-bd49-e1f9bd825c43 diff --git a/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css b/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css new file mode 100644 index 000000000000..8bf5a15216a8 --- /dev/null +++ b/docs/docusaurus/docs/components/_testing/test_data_contexts/dataframe_datasource_pandas/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css @@ -0,0 +1,22 @@ +/*index page*/ +.ge-index-page-site-name-title {} +.ge-index-page-table-container {} +.ge-index-page-table {} +.ge-index-page-table-profiling-links-header {} +.ge-index-page-table-expectations-links-header {} +.ge-index-page-table-validations-links-header {} +.ge-index-page-table-profiling-links-list {} +.ge-index-page-table-profiling-links-item {} +.ge-index-page-table-expectation-suite-link {} +.ge-index-page-table-validation-links-list {} +.ge-index-page-table-validation-links-item {} + +/*breadcrumbs*/ +.ge-breadcrumbs {} +.ge-breadcrumbs-item {} + +/*navigation sidebar*/ +.ge-navigation-sidebar-container {} +.ge-navigation-sidebar-content {} +.ge-navigation-sidebar-title {} +.ge-navigation-sidebar-link {} diff --git a/docs/docusaurus/docs/components/_testing/test_data_contexts/filesystem_datasource_aws_pandas_no_assets/gx/uncomitted/config_variables.yml b/docs/docusaurus/docs/components/_testing/test_data_contexts/filesystem_datasource_aws_pandas_no_assets/gx/uncomitted/config_variables.yml new file mode 100644 index 000000000000..a0399648fa3f --- /dev/null +++ b/docs/docusaurus/docs/components/_testing/test_data_contexts/filesystem_datasource_aws_pandas_no_assets/gx/uncomitted/config_variables.yml @@ -0,0 +1,18 
@@ +# This config file supports variable substitution which enables: 1) keeping +# secrets out of source control & 2) environment-based configuration changes +# such as staging vs prod. +# +# When GX encounters substitution syntax (like `my_key: ${my_value}` or +# `my_key: $my_value`) in the great_expectations.yml file, it will attempt +# to replace the value of `my_key` with the value from an environment +# variable `my_value` or a corresponding key read from this config file, +# which is defined through the `config_variables_file_path`. +# Environment variables take precedence over variables defined here. +# +# Substitution values defined here can be a simple (non-nested) value, +# nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR}) +# +# +# https://docs.greatexpectations.io/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials + +instance_id: e0814d0f-1fce-48e0-8609-33f412321338 diff --git a/docs/docusaurus/docs/components/examples_under_test.py b/docs/docusaurus/docs/components/examples_under_test.py index 27c911ffbd80..eb7d577c47b8 100644 --- a/docs/docusaurus/docs/components/examples_under_test.py +++ b/docs/docusaurus/docs/components/examples_under_test.py @@ -8,6 +8,47 @@ docs_tests = [] +connect_to_dataframe_data = [ + # Create a Data Source, pandas/spark + IntegrationTestFixture( + # To test, run: + # pytest --docs-tests --spark -k "create_a_df_data_source_spark" tests/integration/test_script_runner.py + name="create_a_df_data_source_spark", + user_flow_script="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_source.py", + # data_dir="", + # data_context_dir="", + backend_dependencies=[BackendDependencies.SPARK], + ), + IntegrationTestFixture( + # To test, run: + # pytest --docs-tests -k "create_a_df_data_source_pandas" tests/integration/test_script_runner.py + name="create_a_df_data_source_pandas", + user_flow_script="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_source.py", + # data_dir="", + # data_context_dir="", + backend_dependencies=[], + ), + # Create a Data Asset, pandas + IntegrationTestFixture( + # To test, run: + # pytest --docs-tests -k "create_a_df_data_asset_pandas" tests/integration/test_script_runner.py + name="create_a_df_data_asset_pandas", + user_flow_script="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py", + data_dir="docs/docusaurus/docs/components/_testing/test_data_sets/single_test_file", + # data_context_dir="", + backend_dependencies=[], + ), + # Create a Batch Definition, pandas + IntegrationTestFixture( + # To test, run: + # pytest --docs-tests -k "create_a_df_batch_definition_pandas" tests/integration/test_script_runner.py + name="create_a_df_batch_definition_pandas", + user_flow_script="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_batch_definition.py", + # data_context_dir="", + backend_dependencies=[], + ), +] + docs_example_scripts_run_validations = [ # Create a Validation Definition IntegrationTestFixture( @@ -19,6 +60,25 @@ # data_context_dir="", backend_dependencies=[], ), + # Batch Parameters, for a Batch Definition/for a Validation Definition + IntegrationTestFixture( + # To test, run: + # pytest --docs-tests --spark -k "df_batch_parameters_for_batch_definition" tests/integration/test_script_runner.py + name="df_batch_parameters_for_batch_definition", + 
user_flow_script="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_batch_definition.py", + data_dir="docs/docusaurus/docs/components/_testing/test_data_sets/single_test_file", + # data_context_dir="", + backend_dependencies=[BackendDependencies.SPARK], + ), + IntegrationTestFixture( + # To test, run: + # pytest --docs-tests -k "df_batch_parameters_for_validation_definition" tests/integration/test_script_runner.py + name="df_batch_parameters_for_validation_definition", + user_flow_script="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_validation_definition.py", + data_dir="docs/docusaurus/docs/components/_testing/test_data_sets/single_test_file", + # data_context_dir="", + backend_dependencies=[], + ), # Run a Validation Definition IntegrationTestFixture( # To test, run: @@ -274,10 +334,12 @@ # Extend the docs_tests list with the above sublists (only the docs_tests list is imported # into `test_script_runner.py` and actually used in CI checks). -docs_tests.extend(docs_example_scripts_run_validations) docs_tests.extend(connect_to_filesystem_data_create_a_data_source) docs_tests.extend(connect_to_filesystem_data_create_a_data_asset) docs_tests.extend(connect_to_filesystem_data_create_a_batch_definition) +docs_tests.extend(connect_to_dataframe_data) + +docs_tests.extend(docs_example_scripts_run_validations) docs_tests.extend(learn_data_quality_use_cases) diff --git a/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_batch_definition.py b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_batch_definition.py new file mode 100644 index 000000000000..b013576d1069 --- /dev/null +++ b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_batch_definition.py @@ -0,0 +1,60 @@ +# Define Batch Parameters for a Spark dataframe +# +from pyspark.sql import SparkSession + +csv = "./data/folder_with_data/yellow_tripdata_sample_2019-01.csv" +spark = SparkSession.builder.appName("Read CSV").getOrCreate() +dataframe = spark.read.csv(csv, header=True, inferSchema=True) + +batch_parameters = {"dataframe": dataframe} +# + +# Define Batch Parameters for a pandas dataframe +# +import pandas + +csv_path = "./data/folder_with_data/yellow_tripdata_sample_2019-01.csv" +dataframe = pandas.read_csv(csv_path) + +# +batch_parameters = {"dataframe": dataframe} +# +# + + +def setup_context_for_example(context): + data_source = context.data_sources.add_pandas(name="my_data_source") + data_asset = data_source.add_dataframe_asset(name="my_dataframe_data_asset") + data_asset.add_batch_definition_whole_dataframe("my_batch_definition") + + +# +import great_expectations as gx + +context = gx.get_context() +# Hide this +setup_context_for_example(context) + +# Retrieve the dataframe Batch Definition +data_source_name = "my_data_source" +data_asset_name = "my_dataframe_data_asset" +batch_definition_name = "my_batch_definition" +batch_definition = ( + context.data_sources.get(data_source_name) + .get_asset(data_asset_name) + .get_batch_definition(batch_definition_name) +) + +# Create an Expectation to test +expectation = gx.expectations.ExpectColumnValuesToBeBetween( + column="passenger_count", max_value=6, min_value=1 +) + +# Get the dataframe as a Batch +# highlight-next-line +batch = batch_definition.get_batch(batch_parameters=batch_parameters) + +# Test the Expectation +validation_results = batch.validate(expectation) +print(validation_results) +# diff --git 
a/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_validation_definition.py b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_validation_definition.py new file mode 100644 index 000000000000..75def31e1499 --- /dev/null +++ b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_validation_definition.py @@ -0,0 +1,45 @@ +# Define Batch Parameters for a pandas dataframe +import pandas + +csv_path = "./data/folder_with_data/yellow_tripdata_sample_2019-01.csv" +dataframe = pandas.read_csv(csv_path) + +batch_parameters = {"dataframe": dataframe} + + +def set_up_context_for_example(context): + data_source = context.data_sources.add_pandas(name="my_data_source") + data_asset = data_source.add_dataframe_asset(name="my_dataframe_data_asset") + batch_definition = data_asset.add_batch_definition_whole_dataframe( + "my_batch_definition" + ) + + # Create an Expectation Suite + suite = context.suites.add(gx.ExpectationSuite(name="my_expectation_suite")) + # Add an Expectation to the Expectation Suite + suite.add_expectation( + gx.expectations.ExpectColumnValuesToNotBeNull(column="pickup_datetime") + ) + # Add a Validation Definition + context.validation_definitions.add( + gx.ValidationDefinition( + data=batch_definition, suite=suite, name="my_validation_definition" + ) + ) + + +# +import great_expectations as gx + +context = gx.get_context() +# Hide this +set_up_context_for_example(context) + +# Retrieve a Validation Definition that uses the dataframe Batch Definition +validation_definition_name = "my_validation_definition" +validation_definition = context.validation_definitions.get(validation_definition_name) + +# Validate the dataframe by passing it to the Validation Definition as Batch Parameters. 
+validation_results = validation_definition.run(batch_parameters=batch_parameters) +print(validation_results) +# diff --git a/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_batch_definition.py b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_batch_definition.py new file mode 100644 index 000000000000..4b753bcc17e3 --- /dev/null +++ b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_batch_definition.py @@ -0,0 +1,34 @@ +# +import great_expectations as gx + +context = gx.get_context() +# Hide this +assert type(context).__name__ == "EphemeralDataContext" +# Hide this +# SETUP FOR THE EXAMPLE: +# Hide this +data_source = context.data_sources.add_pandas(name="my_data_source") +# Hide this +data_asset = data_source.add_dataframe_asset(name="my_dataframe_data_asset") + +# Retrieve the Data Asset +# +data_source_name = "my_data_source" +data_asset_name = "my_dataframe_data_asset" +data_asset = context.data_sources.get(data_source_name).get_asset(data_asset_name) +# + +# Define the Batch Definition name +# +batch_definition_name = "my_batch_definition" +# + +# Add a Batch Definition to the Data Asset +# +batch_definition = data_asset.add_batch_definition_whole_dataframe( + batch_definition_name +) +# +# Hide this +assert batch_definition.name == batch_definition_name +# diff --git a/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py index 5bae4b3c462a..3443600f0cfb 100644 --- a/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py +++ b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py @@ -1,38 +1,30 @@ -# -import pandas - +# import great_expectations as gx -context = gx.get_context(mode="file") +context = gx.get_context() +# Hide this +assert type(context).__name__ == "EphemeralDataContext" +# Hide this +# SETUP FOR THE EXAMPLE: +# Hide this +data_source = context.data_sources.add_pandas(name="my_data_source") # Retrieve the Data Source -# +# data_source_name = "my_data_source" -data_source = context.data_sources.get(name=data_source_name) +data_source = context.data_sources.get(data_source_name) # -# -# Read in dataframe -csv_path = "data/sampled_yellow_tripdata_2019-01.csv" -dataframe = pandas.read_csv(csv_path) -# Define a name for the Data Asset -data_asset_name = "pandas_dataframe" +# Define the Data Asset name +# +data_asset_name = "my_dataframe_data_asset" # +# Add a Data Asset to the Data Source # -data_asset = data_source.add_dataframe_asset(name=data_asset_name, dataframe=dataframe) -# - -# Add a Batch Definition -# -batch_definition_name = "dataframe_batch" -batch_definition = data_asset.add_batch_definition(name=batch_definition_name) +data_asset = data_source.add_dataframe_asset(name=data_asset_name) # -# Verify the Batch Definition can read data -# and return records as a Batch -# -batch = batch_definition.get_batch() -print(batch.head()) -# +# Hide this +assert data_asset.name == data_asset_name # diff --git a/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_source.py b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_source.py index 918a07315e4b..b75db22e7ab4 100644 --- a/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_source.py +++ 
b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_source.py @@ -1,13 +1,20 @@ # import great_expectations as gx -context = gx.get_context(mode="file") +# Retrieve your Data Context +context = gx.get_context() +# Hide this +assert type(context).__name__ == "EphemeralDataContext" -# Define the Data Source parameters +# Define the Data Source name +# data_source_name = "my_data_source" +# -# Create the Data Source -# +# Add the Data Source to the Data Context +# data_source = context.data_sources.add_pandas(name=data_source_name) +# Hide this +assert data_source.name == data_source_name # # diff --git a/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_asset.py b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_asset.py deleted file mode 100644 index 5c9e042c6b79..000000000000 --- a/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_asset.py +++ /dev/null @@ -1,41 +0,0 @@ -# -from pyspark.sql import SparkSession - -import great_expectations as gx - -context = gx.get_context(mode="file") - -# Retrieve the Data Source -# -data_source_name = "my_data_source" -data_source = context.data_sources.get(name=data_source_name) -# - -# Add the dataframe as a Data Asset -# -# Read in the dataframe -csv = "data/sampled_yellow_tripdata_2019-01.csv" -spark = SparkSession.builder.appName("Read CSV").getOrCreate() -dataframe = spark.read.csv(csv, header=True, inferSchema=True) -# Define a name for the Data Asset -data_asset_name = "spark_dataframe" -# - -# Create the Data Asset -# -data_asset = data_source.add_dataframe_asset(name=data_asset_name, dataframe=dataframe) -# - -# Add a Batch Definition -# -batch_definition_name = "dataframe_batch" -batch_definition = data_asset.add_batch_definition(name=batch_definition_name) -# - -# Verify the Batch Definition can read data -# and return records as a Batch -# -batch = batch_definition.get_batch() -print(batch.head()) -# -# diff --git a/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_source.py b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_source.py index 01f2f3a85f39..9768206528a1 100644 --- a/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_source.py +++ b/docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_source.py @@ -1,15 +1,20 @@ # import great_expectations as gx -context = gx.get_context(mode="file") +# Retrieve your Data Context +context = gx.get_context() +# Hide this +assert type(context).__name__ == "EphemeralDataContext" -# Define the Data Source parameters -# +# Define the Data Source name +# data_source_name = "my_data_source" # -# Create the Data Source -# +# Add the Data Source to the Data Context +# data_source = context.data_sources.add_spark(name=data_source_name) +# Hide this +assert data_source.name == data_source_name # # diff --git a/docs/docusaurus/docs/core/connect_to_data/dataframes/dataframes.md b/docs/docusaurus/docs/core/connect_to_data/dataframes/dataframes.md index 863d305ff1cf..2de462795db9 100644 --- a/docs/docusaurus/docs/core/connect_to_data/dataframes/dataframes.md +++ b/docs/docusaurus/docs/core/connect_to_data/dataframes/dataframes.md @@ -25,12 +25,13 @@ Because the dataframes reside in memory you do not need to specify the location - - - Optional. . -- Data in a pandas or Spark dataframe. These examples assume the variable `dataframe` contains your pandas or Spark dataframe. - . 
These examples assume the variable `context` contains your Data Context. +### Procedure + - + 1. Define the Data Source parameters. @@ -40,7 +41,7 @@ Because the dataframes reside in memory you do not need to specify the location Update `data_source_name` in the following code with a descriptive name for your Data Source: - ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_source.py - define Data Source parameters" + ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_source.py - define Data Source name" ``` 2. Create the Data Source. @@ -53,7 +54,7 @@ Because the dataframes reside in memory you do not need to specify the location Execute the following code to create a pandas Data Source: - ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_source.py - create Data Source" + ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_source.py Add Data Source" ``` @@ -62,7 +63,7 @@ Because the dataframes reside in memory you do not need to specify the location Execute the following code to create a Spark Data Source: - ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_source.py - create Data Source" + ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_source.py Add Data Source" ``` @@ -75,19 +76,19 @@ Because the dataframes reside in memory you do not need to specify the location - + - ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_source.py - full example" - ``` + ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_source.py - full example" + ``` - + - + - ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_source.py - full example" - ``` + ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_source.py - full example" + ``` - + @@ -95,32 +96,29 @@ Because the dataframes reside in memory you do not need to specify the location -## Create a Data Asset and Batch Definition - -To access data from your dataframe in GX you will connect to the dataframe with a Data Asset. Then you will define a Batch Definition with which the data can be retrieved. +## Create a Data Asset -Because dataframes exist in memory and cease to exist when the Python session ends a dataframe Data Asset will need to be created anew in every Python session that utilizes it. - -In a File Data Context previous dataframe Data Asset and Batch Definition configurations will persist between sessions. However, since the dataframe they connected to will not also persist between Python sessions those configurations will only be useable for reference purposes. +A dataframe Data Asset is used to group your Validation Results. For instance, if you have a data pipeline with three stages and you wanted the Validation Results for each stage to be grouped together, you would create a Data Asset with a unique name representing each stage. ### Prerequisites - - - Optional. . -- Data in a pandas or Spark dataframe. These examples assume the variable `dataframe` contains your pandas or Spark dataframe. - . These examples assume the variable `context` contains your Data Context. 
-- - A [pandas or Spark dataframe Data Source](#create-a-data-source). +- A [pandas or Spark dataframe Data Source](#create-a-data-source). + +### Procedure - + 1. Optional. Retrieve your Data Source. If you do not already have a variable referencing your pandas or Spark Data Source, you can retrieve a previously created one with: - ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_asset.py - retrieve Data Source" + ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py - retrieve Data Source" ``` 2. Define the Data Asset's parameters. @@ -128,71 +126,160 @@ In a File Data Context previous dataframe Data Asset and Batch Definition config A dataframe Data Asset requires the following information: - `name`: A name by which the Data Asset can be referenced. This should be unique among Data Assets on the Data Source. - - `dataframe`: The pandas or Spark dataframe that the Data Asset should retrieve data from. - The following examples create a dataframe by reading a `.csv` file and defines a name for the Data Asset: + Update the `data_asset_name` parameter in the following code with a descriptive name for your Data Asset: - + ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py - define Data Asset name" + ``` - +3. Add a Data Asset to the Data Source. - ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py - define Data Asset parameters" - ``` + Execute the following code to add a Data Asset to your Data Source: - + ```title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py - add Data Asset" + ``` - + - ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_asset.py - define Data Asset parameters" - ``` + - + ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py - full code example" + ``` - + + + + +## Create a Batch Definition + +Typically, a Batch Definition is used to describe how the data within a Data Asset should be retrieved. With dataframes, all of the data in a given dataframe will always be retrieved as a Batch. + +This means that Batch Definitions for dataframe Data Assets don't work to subdivide the data returned for validation. Instead, they serve as an additional layer of organization and allow you to further group your Validation Results. For example, if you have already used your dataframe Data Assets to group your Validation Results by pipeline stage, you could use two Batch Definitions to further group those results by having all automated validations use one Batch Definition and all manually executed validations use the other. + + +### Prerequisites + +- +- + - Optional. . +- . These examples assume the variable `context` contains your Data Context. +- A [pandas or Spark dataframe Data Asset](#create-a-data-asset). + +### Procedure + + -3. Add the Data Asset to the Data Source. + - Execute the following code to create a dataframe Data Asset and add it to your Data Source: +1. Optional. Retrieve your Data Asset. 
- ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_asset.py - create Data Asset" + If you do not already have a variable referencing your pandas or Spark Data Asset, you can retrieve a previously created Data Asset with: + + ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_batch_definition.py - retrieve Data Asset" ``` -4. Add a Batch Definition to the Data Asset. +2. Define the Batch Definition's parameters. + + A dataframe Batch Definition requires the following information: + + - `name`: A name by which the Batch Definition can be referenced. This should be unique among Batch Definitions on the Data Asset. - Dataframe Data Assets do not support further partitioning into Batches. A Batch Definition for a dataframe Data Asset will always have a single Batch available which contains all of the records in the Data Asset. Because of this you only need to provide a name when defining a dataframe Batch Definition: + Because dataframes are always provided in their entirety, dataframe Batch Definitions always use the `add_batch_definition_whole_dataframe()` method. - ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py - add Batch Definition" + Update the value of `batch_definition_name` in the following code with something that describes your dataframe: + + ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_batch_definition.py - define Batch Definition name" ``` -5. Optional. Verify the Batch Definition. +3. Add the Batch Definition to the Data Asset. - You can verify that your Batch Definition can retrieve data from your dataframe by requesting the available Batch and printing the first few records: + Execute the following code to add a Batch Definition to your Data Asset: - ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_asset.py - verify Batch Definition" + ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_batch_definition.py - add Batch Definition" ``` + ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_batch_definition.py - full code example" + ``` + + + + + +## Provide a dataframe through Batch Parameters + +Because dataframes exist in memory and cease to exist when a Python session ends the dataframe itself is not saved as part of a Data Assset or Batch Definition. Instead, a dataframe created in the current Python session is passed in at runtime as a Batch Parameter dictionary. + +### Prerequisites + +- +- + - Optional. . +- . These examples assume the variable `context` contains your Data Context. +- A [Batch Definition on a pandas or Spark dataframe Data Asset](#create-a-batch-definition). +- Data in a pandas or Spark dataframe. These examples assume the variable `dataframe` contains your pandas or Spark dataframe. +- Optional. A Validation Definition. + +### Procedure + +1. Define the Batch Parameter dictionary. 
+ + A dataframe can be added to a Batch Parameter dictionary by defining it as the value of the dictionary key `dataframe`: + + ```python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_batch_definition.py - batch parameters example" + ``` + + The following examples create a dataframe by reading a `.csv` file and store it in a Batch Parameter dictionary: + - ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_pandas_df_data_asset.py - full example" + ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_batch_definition.py - pandas dataframe" ``` - ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_spark_df_data_asset.py - full example" + ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_batch_definition.py - spark dataframe" ``` + +2. Pass the Batch Parameter dictionary to a `get_batch()` or `validate()` method call. + Runtime Batch Parameters can be provided to the `get_batch()` method of a Batch Definition or to the `validate()` method of a Validation Definition. + + + + + + The `get_batch()` method of a Batch Definition retrieves a single Batch of data. Runtime Batch Parameters can be provided to the `get_batch()` method to specify the data returned as a Batch. The `validate()` method of this Batch can then be used to test individual Expectations. + + ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_batch_definition.py - batch.validate() example" + ``` + + The results generated by `batch.validate()` are not persisted in storage. This workflow is solely intended for interactively creating Expectations and exploring data. + + For further information on using an individual Batch to test Expectations, see [Test an Expectation](/core/define_expectations/test_an_expectation.md). + + + + + + A Validation Definition's `run()` method validates an Expectation Suite against a Batch returned by a Batch Definition. Runtime Batch Parameters can be provided to a Validation Definition's `run()` method to specify the data returned in the Batch. This allows you to validate your dataframe by executing the Expectation Suite included in the Validation Definition. + + ```Python title="Python" name="docs/docusaurus/docs/core/connect_to_data/dataframes/_examples/_batch_parameters_validation_definition.py - validation_definition.run() example" + ``` + + For more information on Validation Definitions, see [Run Validations](/core/run_validations/run_validations.md). + + + + \ No newline at end of file
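For reference, a minimal end-to-end sketch that strings together the API calls introduced by the example scripts in this diff (`add_pandas()`, `add_dataframe_asset()`, `add_batch_definition_whole_dataframe()`, `get_batch(batch_parameters=...)`, and `ValidationDefinition.run(batch_parameters=...)`). The names and CSV path mirror the examples above; treat it as an illustration of the documented workflow under those assumptions, not as an additional tested snippet.

```python
# Minimal sketch of the pandas dataframe workflow documented above.
# Assumes GX 1.x and pandas are installed; the CSV path is the sample file used by the examples.
import pandas

import great_expectations as gx

context = gx.get_context()

# Create the dataframe Data Source, Data Asset, and whole-dataframe Batch Definition.
data_source = context.data_sources.add_pandas(name="my_data_source")
data_asset = data_source.add_dataframe_asset(name="my_dataframe_data_asset")
batch_definition = data_asset.add_batch_definition_whole_dataframe("my_batch_definition")

# The dataframe itself is supplied at runtime through Batch Parameters.
dataframe = pandas.read_csv("./data/folder_with_data/yellow_tripdata_sample_2019-01.csv")
batch_parameters = {"dataframe": dataframe}

# Interactive workflow: test a single Expectation against the Batch.
expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="passenger_count", min_value=1, max_value=6
)
batch = batch_definition.get_batch(batch_parameters=batch_parameters)
print(batch.validate(expectation))

# Validation Definition workflow: validate an Expectation Suite against the dataframe.
suite = context.suites.add(gx.ExpectationSuite(name="my_expectation_suite"))
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="pickup_datetime")
)
validation_definition = context.validation_definitions.add(
    gx.ValidationDefinition(
        data=batch_definition, suite=suite, name="my_validation_definition"
    )
)
print(validation_definition.run(batch_parameters=batch_parameters))
```

Both calls accept the same Batch Parameter dictionary, so a single in-memory dataframe can back interactive Expectation testing as well as Suite validation through a Validation Definition.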