From 10958627aa805eb375b403eb21c7f7d622e061c4 Mon Sep 17 00:00:00 2001
From: Josh Stauffer <66793731+joshua-stauffer@users.noreply.github.com>
Date: Wed, 14 Aug 2024 16:34:39 -0400
Subject: [PATCH 1/5] debug commit - revert when tests pass

---
 .../_examples/_directory_whole_directory.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/docusaurus/docs/core/connect_to_data/filesystem_data/_create_a_batch_definition/_examples/_directory_whole_directory.py b/docs/docusaurus/docs/core/connect_to_data/filesystem_data/_create_a_batch_definition/_examples/_directory_whole_directory.py
index d3910b47b675..18cf05c8cb16 100644
--- a/docs/docusaurus/docs/core/connect_to_data/filesystem_data/_create_a_batch_definition/_examples/_directory_whole_directory.py
+++ b/docs/docusaurus/docs/core/connect_to_data/filesystem_data/_create_a_batch_definition/_examples/_directory_whole_directory.py
@@ -1,5 +1,7 @@
 #
 #
+from pathlib import Path
+
 import great_expectations as gx
 
 context = gx.get_context()
@@ -9,6 +11,14 @@
 file_data_asset = context.data_sources.get(data_source_name).get_asset(data_asset_name)
 #
 
+print("--- ci debug ----")
+print(file_data_asset)
+ds = context.data_sources.get(data_source_name)
+print(ds)
+expected_data_dir = ds.base_directory / file_data_asset.data_directory
+print(f"does data source data_directory exist? {expected_data_dir}")
+print(Path.exists(expected_data_dir))
+
 #
 batch_definition_name = "yellow_tripdata"
 batch_definition = file_data_asset.add_batch_definition_whole_directory(

From 6f92e16bffc02a16dc84852ec43c854749109f97 Mon Sep 17 00:00:00 2001
From: Josh Stauffer <66793731+joshua-stauffer@users.noreply.github.com>
Date: Fri, 16 Aug 2024 20:15:16 -0400
Subject: [PATCH 2/5] use stable tmp dir for spark tests

---
 tests/integration/test_script_runner.py | 31 ++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_script_runner.py b/tests/integration/test_script_runner.py
index 8a38e4c8ab81..5f857eae8c54 100644
--- a/tests/integration/test_script_runner.py
+++ b/tests/integration/test_script_runner.py
@@ -10,6 +10,7 @@
 import os
 import pathlib
 import shutil
+from tempfile import TemporaryDirectory
 from typing import List
 
 import pkg_resources
@@ -356,9 +357,37 @@ def pytest_parsed_arguments(request):
     return request.config.option
 
 
+@pytest.fixture
+def spark_tmp_dir():
+    """A Spark session relies on the directory in which it was started. That's a problem
+    for these tests, which create a bunch of random temporary directories, copy data,
+    copy a GX context, and try to run the docs script using the context & data.
+    It should be possible to stop the spark context, but for whatever reason that doesn't fix the
+    problem.
+
+    The solution here is to pass Spark tests a temporary directory with a stable name.
+    Unfortunately, tempfile doesn't give us an API to do that, so we create a normal temp dir,
+    change its name to a stable path, yield for the test, and then rename the dir back to its
+    original name so tempfile can handle cleaning up.
+
+    It's possible that Spark does something magical in the directory it's called in, so if
+    Spark tests start failing because things aren't being found, look here first.
+    """
+    current_dir = pathlib.Path(__file__).parent.absolute()
+    tmp_dir_path = current_dir / "spark_tmp_test_dir"
+    with TemporaryDirectory() as temp_dir:
+        pathlib.Path.rename(pathlib.Path(temp_dir), tmp_dir_path)
+        yield tmp_dir_path
+
+        pathlib.Path.rename(tmp_dir_path, temp_dir)
+
+
 @flaky(rerun_filter=delay_rerun, max_runs=3, min_passes=1)
 @pytest.mark.parametrize("integration_test_fixture", docs_test_matrix, ids=idfn)
-def test_docs(integration_test_fixture, tmp_path, pytest_parsed_arguments):
+def test_docs(integration_test_fixture, tmp_path, pytest_parsed_arguments, spark_tmp_dir):
+    if BackendDependencies.SPARK in integration_test_fixture.backend_dependencies:
+        tmp_path = spark_tmp_dir  # see fixture docstring
+
     _check_for_skipped_tests(pytest_parsed_arguments, integration_test_fixture)
     _execute_integration_test(integration_test_fixture, tmp_path)

From 418a6903085e5118f52e2c9af5d3779da80c0be3 Mon Sep 17 00:00:00 2001
From: Josh Stauffer <66793731+joshua-stauffer@users.noreply.github.com>
Date: Fri, 16 Aug 2024 20:15:49 -0400
Subject: [PATCH 3/5] Revert "debug commit - revert when tests pass"

This reverts commit 10958627aa805eb375b403eb21c7f7d622e061c4.

---
 .../_examples/_directory_whole_directory.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/docs/docusaurus/docs/core/connect_to_data/filesystem_data/_create_a_batch_definition/_examples/_directory_whole_directory.py b/docs/docusaurus/docs/core/connect_to_data/filesystem_data/_create_a_batch_definition/_examples/_directory_whole_directory.py
index 18cf05c8cb16..d3910b47b675 100644
--- a/docs/docusaurus/docs/core/connect_to_data/filesystem_data/_create_a_batch_definition/_examples/_directory_whole_directory.py
+++ b/docs/docusaurus/docs/core/connect_to_data/filesystem_data/_create_a_batch_definition/_examples/_directory_whole_directory.py
@@ -1,7 +1,5 @@
 #
 #
-from pathlib import Path
-
 import great_expectations as gx
 
 context = gx.get_context()
@@ -11,14 +9,6 @@
 file_data_asset = context.data_sources.get(data_source_name).get_asset(data_asset_name)
 #
 
-print("--- ci debug ----")
-print(file_data_asset)
-ds = context.data_sources.get(data_source_name)
-print(ds)
-expected_data_dir = ds.base_directory / file_data_asset.data_directory
-print(f"does data source data_directory exist? {expected_data_dir}")
-print(Path.exists(expected_data_dir))
-
 #
 batch_definition_name = "yellow_tripdata"
 batch_definition = file_data_asset.add_batch_definition_whole_directory(

From 1520d340c020dac66bccceeb8eb68cce5f9359fa Mon Sep 17 00:00:00 2001
From: Josh Stauffer <66793731+joshua-stauffer@users.noreply.github.com>
Date: Fri, 16 Aug 2024 20:39:58 -0400
Subject: [PATCH 4/5] Revert "use stable tmp dir for spark tests"

This reverts commit 6f92e16bffc02a16dc84852ec43c854749109f97.

---
 tests/integration/test_script_runner.py | 31 +------------------------
 1 file changed, 1 insertion(+), 30 deletions(-)

diff --git a/tests/integration/test_script_runner.py b/tests/integration/test_script_runner.py
index 5f857eae8c54..8a38e4c8ab81 100644
--- a/tests/integration/test_script_runner.py
+++ b/tests/integration/test_script_runner.py
@@ -10,7 +10,6 @@
 import os
 import pathlib
 import shutil
-from tempfile import TemporaryDirectory
 from typing import List
 
 import pkg_resources
@@ -357,37 +356,9 @@ def pytest_parsed_arguments(request):
     return request.config.option
 
 
-@pytest.fixture
-def spark_tmp_dir():
-    """A Spark session relies on the directory in which it was started. That's a problem
-    for these tests, which create a bunch of random temporary directories, copy data,
-    copy a GX context, and try to run the docs script using the context & data.
-    It should be possible to stop the spark context, but for whatever reason that doesn't fix the
-    problem.
-
-    The solution here is to pass Spark tests a temporary directory with a stable name.
-    Unfortunately, tempfile doesn't give us an API to do that, so we create a normal temp dir,
-    change its name to a stable path, yield for the test, and then rename the dir back to its
-    original name so tempfile can handle cleaning up.
-
-    It's possible that Spark does something magical in the directory it's called in, so if
-    Spark tests start failing because things aren't being found, look here first.
-    """
-    current_dir = pathlib.Path(__file__).parent.absolute()
-    tmp_dir_path = current_dir / "spark_tmp_test_dir"
-    with TemporaryDirectory() as temp_dir:
-        pathlib.Path.rename(pathlib.Path(temp_dir), tmp_dir_path)
-        yield tmp_dir_path
-
-        pathlib.Path.rename(tmp_dir_path, temp_dir)
-
-
 @flaky(rerun_filter=delay_rerun, max_runs=3, min_passes=1)
 @pytest.mark.parametrize("integration_test_fixture", docs_test_matrix, ids=idfn)
-def test_docs(integration_test_fixture, tmp_path, pytest_parsed_arguments, spark_tmp_dir):
-    if BackendDependencies.SPARK in integration_test_fixture.backend_dependencies:
-        tmp_path = spark_tmp_dir  # see fixture docstring
-
+def test_docs(integration_test_fixture, tmp_path, pytest_parsed_arguments):
     _check_for_skipped_tests(pytest_parsed_arguments, integration_test_fixture)
     _execute_integration_test(integration_test_fixture, tmp_path)

From 06ba2471ba8c8f90a4b1840ae18492b4d4389a4a Mon Sep 17 00:00:00 2001
From: Tyler Hoffman
Date: Mon, 19 Aug 2024 09:42:52 -0400
Subject: [PATCH 5/5] [MAINTENANCE] make docs tests run first to unblock spark
 tests

---
 tests/integration/test_script_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_script_runner.py b/tests/integration/test_script_runner.py
index 8a38e4c8ab81..65d2489dd2c7 100644
--- a/tests/integration/test_script_runner.py
+++ b/tests/integration/test_script_runner.py
@@ -319,6 +319,7 @@ def delay_rerun(*args):
 
 
 # populate docs_test_matrix with sub-lists
+docs_test_matrix += docs_tests  # this has to go first. TODO: Fix in V1-481
 docs_test_matrix += local_tests
 docs_test_matrix += quickstart
 docs_test_matrix += fluent_datasources
@@ -338,7 +339,6 @@
 docs_test_matrix += aws_glue_integration_tests
 docs_test_matrix += multiple_backend
 docs_test_matrix += failed_rows_tests
-docs_test_matrix += docs_tests
 
 
 pandas_integration_tests: List[IntegrationTestFixture] = []
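For reference, the trick that PATCH 2 introduced (and PATCH 4 reverted once PATCH 5's
test-ordering workaround landed) reduces to renaming a tempfile-managed directory to a
stable path for the duration of a test. Below is a minimal, self-contained sketch of
that pattern, assuming only pytest and the standard library; the fixture name
stable_tmp_dir and the directory name stable_tmp_test_dir are illustrative, not names
from the repo. pytest resumes a generator fixture after its yield during teardown even
when the test fails, so the rename back still runs and TemporaryDirectory can clean up
as usual.

    import pathlib
    from tempfile import TemporaryDirectory

    import pytest


    @pytest.fixture
    def stable_tmp_dir():
        # A fixed, predictable path next to this test module.
        stable_path = pathlib.Path(__file__).parent.absolute() / "stable_tmp_test_dir"
        with TemporaryDirectory() as temp_dir:
            # Move the randomly named temp dir to the stable path for the test.
            # This uses os.rename semantics, so both paths must be on the same
            # filesystem.
            pathlib.Path(temp_dir).rename(stable_path)
            yield stable_path
            # Move it back so TemporaryDirectory can remove it on exit.
            stable_path.rename(temp_dir)

A test that requests stable_tmp_dir receives a pathlib.Path it can use like pytest's
built-in tmp_path, except that the location is deterministic across runs. The flip side
of that determinism is the likely reason the approach was dropped here: because the
path is fixed, tests sharing the fixture cannot safely run concurrently.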