From c9ddc2e84e72144f06deb4b312f9cabe3e79be90 Mon Sep 17 00:00:00 2001
From: Finn van Krieken
Date: Fri, 20 Dec 2024 13:35:58 -0500
Subject: [PATCH] chk test validate

---
 dcpy/test/lifecycle/ingest/shared.py        | 37 +++++++++++++
 dcpy/test/lifecycle/ingest/test_validate.py | 57 ++++++++++++++++++++-
 2 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/dcpy/test/lifecycle/ingest/shared.py b/dcpy/test/lifecycle/ingest/shared.py
index b6995ba01..98aef2cbf 100644
--- a/dcpy/test/lifecycle/ingest/shared.py
+++ b/dcpy/test/lifecycle/ingest/shared.py
@@ -1,20 +1,30 @@
+from datetime import datetime
 from pathlib import Path
 
 from dcpy.models.connectors.edm.publishing import GisDataset
+from dcpy.models.connectors.edm.recipes import Dataset
+from dcpy.models import file, library
 from dcpy.models.connectors import socrata, web
 from dcpy.models.lifecycle.ingest import (
     LocalFileSource,
     ScriptSource,
     S3Source,
     DEPublished,
+    DatasetAttributes,
+    ArchivalMetadata,
+    Ingestion,
+    Config,
 )
+from dcpy.utils.metadata import get_run_details
 from dcpy.test.conftest import RECIPES_BUCKET
 
 RESOURCES = Path(__file__).parent / "resources"
 TEMPLATE_DIR = RESOURCES / "templates"
 TEST_DATA_DIR = "test_data"
+TEST_OUTPUT = RESOURCES / TEST_DATA_DIR / "output.parquet"
 TEST_DATASET_NAME = "test_dataset"
 FAKE_VERSION = "20240101"
+TEST_DATASET = Dataset(id=TEST_DATASET_NAME, version=FAKE_VERSION)
 
 
 class Sources:
@@ -39,6 +49,33 @@ class Sources:
     )
 
 
+BASIC_CONFIG = Config(
+    id=TEST_DATASET_NAME,
+    version=FAKE_VERSION,
+    attributes=DatasetAttributes(name=TEST_DATASET_NAME),
+    archival=ArchivalMetadata(
+        archival_timestamp=datetime(2024, 1, 1),
+        raw_filename="dummy.txt",
+        acl="public-read",
+    ),
+    ingestion=Ingestion(source=Sources.local_file, file_format=file.Csv(type="csv")),
+    run_details=get_run_details(),
+)
+
+BASIC_LIBRARY_CONFIG = library.Config(
+    dataset=library.DatasetDefinition(
+        name=TEST_DATASET_NAME,
+        version=FAKE_VERSION,
+        acl="public-read",
+        source=library.DatasetDefinition.SourceSection(),
+        destination=library.DatasetDefinition.DestinationSection(
+            geometry=library.GeometryType(SRS="NONE", type="NONE")
+        ),
+    ),
+    execution_details=get_run_details(),
+)
+
+
 SOURCE_FILENAMES = [
     (Sources.local_file, "dummy.txt"),
     (Sources.gis, f"{TEST_DATASET_NAME}.zip"),
diff --git a/dcpy/test/lifecycle/ingest/test_validate.py b/dcpy/test/lifecycle/ingest/test_validate.py
index 70fbc2261..13e74eb4a 100644
--- a/dcpy/test/lifecycle/ingest/test_validate.py
+++ b/dcpy/test/lifecycle/ingest/test_validate.py
@@ -1,15 +1,68 @@
+from io import BytesIO
+import json
 import pytest
 import yaml
 
+from dcpy.test.conftest import RECIPES_BUCKET
 from dcpy.models.lifecycle.ingest import Template
-from dcpy.lifecycle.ingest import configure, transform
+from dcpy.utils import s3
+from dcpy.connectors.edm import recipes
+from dcpy.lifecycle.ingest import configure, transform, validate
+
+from .shared import (
+    TEST_DATASET,
+    TEST_OUTPUT,
+    BASIC_CONFIG,
+    BASIC_LIBRARY_CONFIG,
+)
 
 
 @pytest.mark.parametrize("dataset", [t.name for t in configure.TEMPLATE_DIR.glob("*")])
-def test_validate_all_datasets(dataset):
+def test_validate_all_templates(dataset):
     with open(configure.TEMPLATE_DIR / dataset, "r") as f:
         s = yaml.safe_load(f)
     template = Template(**s)
     transform.validate_processing_steps(
         template.id, template.ingestion.processing_steps
     )
+
+
+class TestValidateAgainstExistingVersions:
+    def test_new(self, create_buckets):
+        assert (
+            validate.validate_against_existing_versions(TEST_DATASET, TEST_OUTPUT)
+            == validate.ArchiveAction.push
+        )
+
+    def test_existing_library(self, create_buckets):
+        ds = BASIC_LIBRARY_CONFIG.sparse_dataset
+        config_str = json.dumps(BASIC_LIBRARY_CONFIG.model_dump(mode="json"))
+        s3.upload_file_obj(
+            BytesIO(config_str.encode()),
+            RECIPES_BUCKET,
+            f"{recipes.s3_folder_path(ds)}/config.json",
+            BASIC_LIBRARY_CONFIG.dataset.acl,
+        )
+        assert recipes.exists(ds)
+        assert (
+            validate.validate_against_existing_versions(ds, TEST_OUTPUT)
+            == validate.ArchiveAction.do_nothing
+        )
+
+    def test_existing(self, create_buckets):
+        ds = BASIC_CONFIG.dataset
+        recipes.archive_dataset(BASIC_CONFIG, TEST_OUTPUT)
+        assert recipes.exists(ds)
+        assert (
+            validate.validate_against_existing_versions(ds, TEST_OUTPUT)
+            == validate.ArchiveAction.update_freshness
+        )
+
+    def test_existing_data_diffs(self, create_buckets):
+        ds = BASIC_CONFIG.dataset
+        recipes.archive_dataset(BASIC_CONFIG, TEST_OUTPUT)
+        assert recipes.exists(ds)
+        with pytest.raises(FileExistsError):
+            validate.validate_against_existing_versions(
+                ds, TEST_OUTPUT.parent / "test.parquet"
+            )