From 90bda47ea3496f4124bfac9e550c75ff30035e16 Mon Sep 17 00:00:00 2001 From: earmenda Date: Wed, 5 Jan 2022 11:44:23 -0800 Subject: [PATCH] Implement `scan database` command (#292) * Implement database_coverage command * Add tests for find_missing_dataset_fields * Change database-coverage to only count uncategorized fields * Fix issues merging from main * Change command to use new name / format * Add integration test for database_coverage * Change command to use percent not decimal * Add manifest-dir option and check for all datasets now * Small fixes and ci violations * Fix broken tests * doc strings updates * moved the coverage percentage to the bottom of the error message * add integration tests for manifest, update coverage used in tests to int * Apply suggestions from code review small docs change Co-authored-by: Phil Salant * Address comments, mostly refactoring * Fix command formatting after some testing * a few output nits * Update docs to include a guide for using the scan command * Rename guides to How-To Guides Co-authored-by: Eduardo Armendariz Co-authored-by: Thomas La Piana Co-authored-by: SteveDMurphy Co-authored-by: Thomas La Piana Co-authored-by: Phil Salant --- docs/fides/docs/guides/generate_dataset.md | 69 ++++ docs/fides/docs/guides/scan_resource.md | 81 ++++ docs/fides/mkdocs.yml | 3 + fidesctl/src/fidesctl/cli/__init__.py | 2 + fidesctl/src/fidesctl/cli/cli.py | 33 ++ .../src/fidesctl/core/generate_dataset.py | 191 ++++++++- fidesctl/test_file.yml | 39 -- fidesctl/tests/core/test_generate_dataset.py | 363 ++++++++++++++++-- 8 files changed, 686 insertions(+), 95 deletions(-) create mode 100644 docs/fides/docs/guides/generate_dataset.md create mode 100644 docs/fides/docs/guides/scan_resource.md delete mode 100644 fidesctl/test_file.yml diff --git a/docs/fides/docs/guides/generate_dataset.md b/docs/fides/docs/guides/generate_dataset.md new file mode 100644 index 0000000000..3a97de1cf4 --- /dev/null +++ b/docs/fides/docs/guides/generate_dataset.md @@ -0,0 +1,69 @@ +# Generating a Dataset + +As an alternative to manually creating dataset resource files like in our [tutorial](../tutorial/dataset.md), it is possible to generate these files using the `generate-dataset` CLI command. The CLI will connect to a given resource and automatically generate a non-annotated resource YAML file in the specified location, based on the database schema. + +Not only is this the simplest way to begin annotating your resources, but it also follows the expected fidesctl format for these resources. This is important as some commands, like `scan`, expect resources to follow this format. + +# Working With a Database + +The `generate-dataset` command can connect to a database and automatically generate resource YAML file. Given a database schema with a single `users` table as follows: + +```shell +flaskr=# SELECT * FROM users; + id | created_at | email | password | first_name | last_name +----+---------------------+-------------------+------------------------------------+------------+----------- + 1 | 2020-01-01 00:00:00 | admin@example.com | pbkdf2:sha256:260000$O87nanbSkl... | Admin | User + 2 | 2020-01-03 00:00:00 | user@example.com | pbkdf2:sha256:260000$PGcBy5NzZe... 
| Example    | User
+(2 rows)
+```
+
+We can invoke the `generate-dataset` command by providing a connection URL for this database:
+```sh
+./venv/bin/fidesctl generate-dataset \
+  postgresql://postgres:postgres@localhost:5432/flaskr \
+  fides_resources/flaskr_postgres_dataset.yml
+```
+
+The result is a resource file containing a dataset, with collections and fields that represent our schema:
+```
+dataset:
+- fides_key: public
+  organization_fides_key: default_organization
+  name: public
+  description: 'Fides Generated Description for Schema: public'
+  meta: null
+  data_categories: []
+  data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+  collections:
+  - name: public.users
+    description: 'Fides Generated Description for Table: public.users'
+    data_categories: []
+    data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    fields:
+    - name: created_at
+      description: 'Fides Generated Description for Column: created_at'
+      data_categories: []
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    - name: email
+      description: 'Fides Generated Description for Column: email'
+      data_categories: []
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    - name: first_name
+      description: 'Fides Generated Description for Column: first_name'
+      data_categories: []
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    - name: id
+      description: 'Fides Generated Description for Column: id'
+      data_categories: []
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    - name: last_name
+      description: 'Fides Generated Description for Column: last_name'
+      data_categories: []
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    - name: password
+      description: 'Fides Generated Description for Column: password'
+      data_categories: []
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+```
+
+The generated dataset still needs to be annotated with data categories that describe what is stored.
diff --git a/docs/fides/docs/guides/scan_resource.md b/docs/fides/docs/guides/scan_resource.md
new file mode 100644
index 0000000000..32ec1ec658
--- /dev/null
+++ b/docs/fides/docs/guides/scan_resource.md
@@ -0,0 +1,81 @@
+# Scanning a Resource
+
+As you annotate resources with fidesctl, it is important to keep those resources up to date. The `scan` command compares what is defined in your fidesctl server (or in local resource files) against a given data resource and outputs any part of a dataset that is not defined or categorized. The command exits with an error if a coverage threshold is not met.
+
+The `scan` command works best in tandem with the `generate-dataset` command, since `generate-dataset` creates resources in the expected format. Datasets must follow the fidesctl format for coverage to be tracked.
+
+# Scanning a Database
+
+The `scan` command can connect to a database and compare its schema to your existing resources. Given a database schema with a single `users` table as follows:
+
+```shell
+flaskr=# SELECT * FROM users;
+ id |     created_at      |       email       |              password              | first_name | last_name
+----+---------------------+-------------------+------------------------------------+------------+-----------
+  1 | 2020-01-01 00:00:00 | admin@example.com | pbkdf2:sha256:260000$O87nanbSkl... | Admin      | User
+  2 | 2020-01-03 00:00:00 | user@example.com  | pbkdf2:sha256:260000$PGcBy5NzZe... | Example    | User
+(2 rows)
+```
+
+We have previously annotated this schema in full with the following dataset resource file:
+```
+dataset:
+- fides_key: public
+  organization_fides_key: default_organization
+  name: public
+  description: 'Fides Generated Description for Schema: public'
+  meta: null
+  data_categories: []
+  data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+  collections:
+  - name: public.users
+    description: 'Fides Generated Description for Table: public.users'
+    data_categories: []
+    data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    fields:
+    - name: created_at
+      description: 'Fides Generated Description for Column: created_at'
+      data_categories: [system.operations]
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    - name: email
+      description: 'Fides Generated Description for Column: email'
+      data_categories: [user.provided.identifiable.contact.email]
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    - name: first_name
+      description: 'Fides Generated Description for Column: first_name'
+      data_categories: [user.provided.identifiable.name]
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    - name: id
+      description: 'Fides Generated Description for Column: id'
+      data_categories: [user.derived.identifiable.unique_id]
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    - name: last_name
+      description: 'Fides Generated Description for Column: last_name'
+      data_categories: [user.provided.identifiable.name]
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+    - name: password
+      description: 'Fides Generated Description for Column: password'
+      data_categories: [user.provided.identifiable.credentials.password]
+      data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified
+```
+
+We can invoke the `scan` command by providing a connection URL for this database:
+```sh
+./venv/bin/fidesctl scan \
+  --manifest-dir dataset.yml \
+  database \
+  postgresql+psycopg2://postgres:fidesctl@fidesctl-db:5432/postgres
+```
+
+The command output confirms our database resource is fully covered!
+```sh
+Loading resource manifests from: dataset.yml
+Taxonomy successfully created.
+Successfully scanned the following datasets:
+	public
+
+Annotation coverage: 100%
+```
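+
+If full coverage is not yet achievable, a lower bar can be set with the `--coverage-threshold` option (an integer from 0 to 100). As an illustration, the following invocation only fails when annotation coverage drops below 90%:
+
+```sh
+./venv/bin/fidesctl scan \
+  --manifest-dir dataset.yml \
+  --coverage-threshold 90 \
+  database \
+  postgresql+psycopg2://postgres:fidesctl@fidesctl-db:5432/postgres
+```
+
+Any fields that are missing annotations are listed under `The following fields are missing data category annotations:`, and the command exits with a non-zero status whenever the reported `Annotation coverage` falls below the threshold.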
diff --git a/docs/fides/mkdocs.yml b/docs/fides/mkdocs.yml
index e6c4a1daa9..2028e0a076 100644
--- a/docs/fides/mkdocs.yml
+++ b/docs/fides/mkdocs.yml
@@ -30,6 +30,9 @@ nav:
       - Write a Policy: tutorial/policy.md
       - Add Google Analytics: tutorial/google.md
       - Manage Google Analytics with Fidesctl: tutorial/pass.md
+  - How-To Guides:
+      - Generating a Dataset: guides/generate_dataset.md
+      - Scanning a Resource: guides/scan_resource.md
   - CI/CD Reference: ci_reference.md
   - Fides Language:
       - Overview: language/overview.md
diff --git a/fidesctl/src/fidesctl/cli/__init__.py b/fidesctl/src/fidesctl/cli/__init__.py
index 2a0638db1c..7a6519f882 100644
--- a/fidesctl/src/fidesctl/cli/__init__.py
+++ b/fidesctl/src/fidesctl/cli/__init__.py
@@ -7,6 +7,7 @@
     delete,
     evaluate,
     generate_dataset,
+    scan,
     annotate_dataset,
     get,
     init_db,
@@ -26,6 +27,7 @@
     apply,
     delete,
     generate_dataset,
+    scan,
     get,
     init_db,
     ls,
diff --git a/fidesctl/src/fidesctl/cli/cli.py b/fidesctl/src/fidesctl/cli/cli.py
index d95325f0c7..78c8e8eefe 100644
--- a/fidesctl/src/fidesctl/cli/cli.py
+++ b/fidesctl/src/fidesctl/cli/cli.py
@@ -140,6 +140,39 @@ def generate_dataset(
     _generate_dataset.generate_dataset(connection_string, output_filename)
 
 
+@click.command()
+@click.pass_context
+@click.argument("source_type", type=click.Choice(["database"]))
+@click.argument("connection_string", type=str)
+@click.option("-m", "--manifest-dir", type=str, default="")
+@click.option("-c", "--coverage-threshold", type=click.IntRange(0, 100), default=100)
+def scan(
+    ctx: click.Context,
+    source_type: str,
+    connection_string: str,
+    manifest_dir: str,
+    coverage_threshold: int,
+) -> None:
+    """
+    Connect to a database directly via a SQLAlchemy-style connection string and
+    compare the database objects to existing datasets.
+
+    If there are fields within the database that aren't listed and categorized
+    within one of the datasets, this counts as lacking coverage.
+
+    Outputs missing fields and exits with a non-zero status if coverage is
+    under the stated threshold.
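+
+    Example (illustrative values only):
+
+        fidesctl scan --coverage-threshold 90 database \
+            postgresql://<username>:<password>@<host>:5432/<database>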
+ """ + config = ctx.obj["CONFIG"] + _generate_dataset.database_coverage( + connection_string=connection_string, + manifest_dir=manifest_dir, + coverage_threshold=coverage_threshold, + url=config.cli.server_url, + headers=config.user.request_headers, + ) + + @click.command() @click.pass_context @click.argument("input_filename", type=str) diff --git a/fidesctl/src/fidesctl/core/generate_dataset.py b/fidesctl/src/fidesctl/core/generate_dataset.py index fcf5862893..01d7bfb700 100644 --- a/fidesctl/src/fidesctl/core/generate_dataset.py +++ b/fidesctl/src/fidesctl/core/generate_dataset.py @@ -1,12 +1,15 @@ """Module that generates valid dataset manifest files from various data sources.""" -from typing import Dict, List +from typing import Dict, List, Tuple, Optional, Callable import sqlalchemy from sqlalchemy.engine import Engine +from pydantic import AnyHttpUrl +from fidesctl.core.api_helpers import get_server_resource +from fidesctl.core.parse import parse from fideslang import manifests -from fideslang.models import Dataset, DatasetCollection, DatasetField -from .utils import get_db_engine, echo_green +from fideslang.models import Dataset, DatasetCollection, DatasetField, Taxonomy +from .utils import get_db_engine, echo_green, echo_red def get_postgres_collections_and_fields( @@ -82,6 +85,21 @@ def get_mssql_collections_and_fields(engine: Engine) -> Dict[str, Dict[str, List return db_tables +def get_db_collections_and_fields(engine: Engine) -> Dict[str, Dict[str, List[str]]]: + """ + Returns a database collections and fields, delegating to a specific engine dialect function + """ + database_ingestion_functions: Dict[ + str, Callable[[Engine], Dict[str, Dict[str, List[str]]]] + ] = { + "postgresql": get_postgres_collections_and_fields, + "mysql": get_mysql_collections_and_fields, + "mssql": get_mssql_collections_and_fields, + } + collections_and_fields = database_ingestion_functions[engine.dialect.name](engine) + return collections_and_fields + + def create_dataset_collections( db_tables: Dict[str, Dict[str, List[str]]] ) -> List[Dataset]: @@ -95,10 +113,12 @@ def create_dataset_collections( fides_key=schema_name, name=schema_name, description=f"Fides Generated Description for Schema: {schema_name}", + data_categories=[], collections=[ DatasetCollection( name=table_name, description=f"Fides Generated Description for Table: {table_name}", + data_categories=[], fields=[ DatasetField( name=column, @@ -116,20 +136,155 @@ def create_dataset_collections( return table_manifests -def create_dataset(engine: Engine, collections: List[DatasetCollection]) -> Dataset: +def find_uncategorized_dataset_fields( + dataset_key: str, dataset: Optional[Dataset], db_dataset: Dict[str, List[str]] +) -> Tuple[List[str], int]: + """ + Given an object that represents a database dataset definition, finds + uncategorized keys and coverage ratio. 
+ """ + uncategorized_fields = [] + total_field_count = 0 + + for db_dataset_collection in db_dataset.keys(): + dataset_collection = ( + next( + ( + collection + for collection in dataset.collections + if collection.name == db_dataset_collection + ), + None, + ) + if dataset + else None + ) + + for db_dataset_field in db_dataset.get(db_dataset_collection, []): + total_field_count += 1 + field_uncategorized = ( + all( + field.name != db_dataset_field or not field.data_categories + for field in dataset_collection.fields + ) + if dataset_collection + else True + ) + + if field_uncategorized: + uncategorized_fields.append( + f"{dataset_key}.{db_dataset_collection}.{db_dataset_field}" + ) + return uncategorized_fields, total_field_count + + +def find_all_uncategorized_dataset_fields( + manifest_taxonomy: Optional[Taxonomy], + db_collections: Dict[str, Dict[str, List[str]]], + url: AnyHttpUrl, + headers: Dict[str, str], +) -> Tuple[List[str], int]: + """ + Finds all uncategorized fields given a database modeled object. Datasets + are pulled from the server unless a manifest taxonomy is supplied. + """ + uncategorized_fields = [] + total_field_count = 0 + for db_dataset_key in db_collections.keys(): + dataset = ( + next( + ( + dataset + for dataset in manifest_taxonomy.dataset + if dataset.fides_key == db_dataset_key + ), + None, + ) + if manifest_taxonomy + else get_server_resource( + url=url, + resource_type="dataset", + resource_key=db_dataset_key, + headers=headers, + ) + ) + db_dataset = db_collections.get(db_dataset_key, {}) + ( + current_uncategorized_keys, + current_field_count, + ) = find_uncategorized_dataset_fields( + dataset_key=db_dataset_key, dataset=dataset, db_dataset=db_dataset + ) + total_field_count += current_field_count + uncategorized_fields += current_uncategorized_keys + + return uncategorized_fields, total_field_count + + +def print_database_coverage_result( + datasets: List[str], + uncategorized_fields: List[str], + coverage_percent: int, + coverage_threshold: int, +) -> None: """ - Generate a partial dataset manifest, sans tables/fields, - given a database engine. + Prints uncategorized fields and raises an exception if coverage + is lower than provided threshold. """ - url = engine.url - name = url.database - dataset = Dataset( - fides_key=name, - name=name, - description=f"Fides Generated Description for Dataset: {name}", - collections=collections, + output: str = "Successfully scanned the following datasets:\n" + output += "\t{}\n".format("\n\t".join(datasets)) + echo_green(output) + + if uncategorized_fields: + uncategorized_output = ( + "The following fields are missing data category annotations:\n" + ) + uncategorized_output += "\t{}\n".format("\n\t".join(uncategorized_fields)) + echo_red(uncategorized_output) + annotation_output = "Annotation coverage: {}%".format(coverage_percent) + if coverage_percent < coverage_threshold: + echo_red(annotation_output) + raise SystemExit(1) + echo_green(annotation_output) + + +def database_coverage( + connection_string: str, + manifest_dir: Optional[str], + coverage_threshold: int, + url: AnyHttpUrl, + headers: Dict[str, str], +) -> None: + """ + Given a database connection string, fetches collections + and fields and compares them to existing datasets or datasets in a + local manifest (if one is provided). 
+ """ + manifest_taxonomy = parse(manifest_dir) if manifest_dir else None + + # Generate the collections and fields for the target database + db_engine = get_db_engine(connection_string) + db_collections = get_db_collections_and_fields(db_engine) + + uncategorized_fields, db_field_count = find_all_uncategorized_dataset_fields( + manifest_taxonomy=manifest_taxonomy, + db_collections=db_collections, + url=url, + headers=headers, + ) + if db_field_count < 1: + echo_red("Database did not contain any dataset fields to evaluate coverage") + raise SystemExit(1) + + coverage_percent = int( + ((db_field_count - len(uncategorized_fields)) / db_field_count) * 100 + ) + print_database_coverage_result( + datasets=list(db_collections.keys()), + uncategorized_fields=uncategorized_fields, + coverage_percent=coverage_percent, + coverage_threshold=coverage_threshold, ) - return dataset def generate_dataset(connection_string: str, file_name: str) -> str: @@ -137,14 +292,8 @@ def generate_dataset(connection_string: str, file_name: str) -> str: Given a database connection string, extract all tables/fields from it and write out a boilerplate dataset manifest. """ - database_ingestion_functions = { - "postgresql": get_postgres_collections_and_fields, - "mysql": get_mysql_collections_and_fields, - "mssql": get_mssql_collections_and_fields, - } - db_engine = get_db_engine(connection_string) - db_collections = database_ingestion_functions[db_engine.dialect.name](db_engine) + db_collections = get_db_collections_and_fields(db_engine) collections = create_dataset_collections(db_collections) manifests.write_manifest(file_name, [i.dict() for i in collections], "dataset") echo_green(f"Generated dataset manifest written to {file_name}") diff --git a/fidesctl/test_file.yml b/fidesctl/test_file.yml deleted file mode 100644 index 67a6061451..0000000000 --- a/fidesctl/test_file.yml +++ /dev/null @@ -1,39 +0,0 @@ -dataset: -- fides_key: mysql_example - organization_fides_key: default_organization - name: mysql_example - description: 'Fides Generated Description for Schema: mysql_example' - meta: null - data_categories: null - data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified - collections: - - name: login - description: 'Fides Generated Description for Table: login' - data_categories: null - data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified - fields: - - name: customer_id - description: 'Fides Generated Description for Column: customer_id' - data_categories: [] - data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified - - name: id - description: 'Fides Generated Description for Column: id' - data_categories: [] - data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified - - name: time - description: 'Fides Generated Description for Column: time' - data_categories: [] - data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified - - name: visit - description: 'Fides Generated Description for Table: visit' - data_categories: null - data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified - fields: - - name: email - description: 'Fides Generated Description for Column: email' - data_categories: [] - data_qualifier: aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified - - name: last_visit - description: 'Fides Generated Description for Column: last_visit' - data_categories: [] - data_qualifier: 
aggregated.anonymized.unlinked_pseudonymized.pseudonymized.identified diff --git a/fidesctl/tests/core/test_generate_dataset.py b/fidesctl/tests/core/test_generate_dataset.py index ce5a91575b..d11da8c487 100644 --- a/fidesctl/tests/core/test_generate_dataset.py +++ b/fidesctl/tests/core/test_generate_dataset.py @@ -1,8 +1,9 @@ import sqlalchemy import pytest +from typing import List, Dict - -from fidesctl.core import generate_dataset +from fidesctl.core import generate_dataset, api +from fideslang.manifests import write_manifest from fideslang.models import Dataset, DatasetCollection, DatasetField @@ -18,6 +19,29 @@ MASTER_MSSQL_URL = MSSQL_URL_TEMPLATE.format("master") + "&autocommit=True" +def create_server_datasets(test_config, datasets: List[Dataset]): + for dataset in datasets: + api.delete( + url=test_config.cli.server_url, + resource_type="dataset", + resource_id=dataset.fides_key, + headers=test_config.user.request_headers, + ) + api.create( + url=test_config.cli.server_url, + resource_type="dataset", + json_resource=dataset.json(exclude_none=True), + headers=test_config.user.request_headers, + ) + + +def set_field_data_categories(datasets: List[Dataset], category: str): + for dataset in datasets: + for collection in dataset.collections: + for field in collection.fields: + field.data_categories.append(category) + + @pytest.fixture() def test_dataset(): collections = [ @@ -76,11 +100,13 @@ def test_generate_dataset_collections(): Dataset( name="ds", fides_key="ds", + data_categories=[], description="Fides Generated Description for Schema: ds", collections=[ DatasetCollection( name="foo", description="Fides Generated Description for Table: foo", + data_categories=[], fields=[ DatasetField( name=1, @@ -97,6 +123,7 @@ def test_generate_dataset_collections(): DatasetCollection( name="bar", description="Fides Generated Description for Table: bar", + data_categories=[], fields=[ DatasetField( name=4, @@ -118,13 +145,124 @@ def test_generate_dataset_collections(): @pytest.mark.unit -def test_generate_dataset_info(test_dataset): - test_url = "postgresql+psycopg2://fidesdb:fidesdb@fidesdb:5432/fidesdb" - test_engine = sqlalchemy.create_engine(test_url) - actual_result = generate_dataset.create_dataset( - test_engine, test_dataset.collections +def test_find_uncategorized_dataset_fields_all_categorized(): + test_resource = {"ds": {"foo": ["1", "2"], "bar": ["4", "5"]}} + dataset = Dataset( + name="ds", + fides_key="ds", + collections=[ + DatasetCollection( + name="foo", + fields=[ + DatasetField( + name=1, + data_categories=["category_1"], + ), + DatasetField( + name=2, + data_categories=["category_1"], + ), + ], + ), + DatasetCollection( + name="bar", + fields=[ + DatasetField( + name=4, + data_categories=["category_1"], + ), + DatasetField(name=5, data_categories=["category_1"]), + ], + ), + ], ) - assert actual_result == test_dataset + uncategorized_keys, total_field_count = generate_dataset.find_uncategorized_dataset_fields( + dataset_key="ds", dataset=dataset, db_dataset=test_resource.get("ds") + ) + assert not uncategorized_keys + assert total_field_count == 4 + + +@pytest.mark.unit +def test_find_uncategorized_dataset_fields_uncategorized_fields(): + test_resource = {"ds": {"foo": ["1", "2"]}} + dataset = Dataset( + name="ds", + fides_key="ds", + data_categories=["category_1"], + collections=[ + DatasetCollection( + name="foo", + data_categories=["category_1"], + fields=[ + DatasetField( + name=1, + data_categories=["category_1"], + ), + DatasetField(name=2), + ], + ) + ], + ) + 
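+    # Field "2" has no data categories, so it should be reported as uncategorized.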
uncategorized_keys, total_field_count = generate_dataset.find_uncategorized_dataset_fields( + dataset_key="ds", dataset=dataset, db_dataset=test_resource.get("ds") + ) + assert set(uncategorized_keys) == {"ds.foo.2"} + assert total_field_count == 2 + + +@pytest.mark.unit +def test_find_uncategorized_dataset_fields_missing_field(): + test_resource = {"ds": {"bar": ["4", "5"]}} + dataset = Dataset( + name="ds", + fides_key="ds", + collections=[ + DatasetCollection( + name="bar", + fields=[ + DatasetField( + name=4, + data_categories=["category_1"], + ) + ], + ), + ], + ) + uncategorized_keys, total_field_count = generate_dataset.find_uncategorized_dataset_fields( + dataset_key="ds", dataset=dataset, db_dataset=test_resource.get("ds") + ) + assert set(uncategorized_keys) == {"ds.bar.5"} + assert total_field_count == 2 + + +@pytest.mark.unit +def test_find_uncategorized_dataset_fields_missing_collection(): + test_resource = {"ds": {"foo": ["1", "2"], "bar": ["4", "5"]}} + dataset = Dataset( + name="ds", + fides_key="ds", + collections=[ + DatasetCollection( + name="bar", + fields=[ + DatasetField( + name=4, + data_categories=["category_1"], + ), + DatasetField( + name=5, + data_categories=["category_1"], + ), + ], + ), + ], + ) + uncategorized_keys, total_field_count = generate_dataset.find_uncategorized_dataset_fields( + dataset_key="ds", dataset=dataset, db_dataset=test_resource.get("ds") + ) + assert set(uncategorized_keys) == {"ds.foo.1", "ds.foo.2"} + assert total_field_count == 4 @pytest.mark.unit @@ -136,6 +274,13 @@ def test_unsupported_dialect_error(): @pytest.mark.postgres class TestPostgres: + EXPECTED_COLLECTION = { + "public": { + "public.visit": ["email", "last_visit"], + "public.login": ["id", "customer_id", "time"], + } + } + @pytest.fixture(scope="class", autouse=True) def postgres_setup(self): "Set up the Postgres Database for testing." 
@@ -148,23 +293,75 @@ def postgres_setup(self): @pytest.mark.integration def test_get_db_tables_postgres(self): engine = sqlalchemy.create_engine(POSTGRES_URL) - expected_result = { - "public": { - "public.visit": ["email", "last_visit"], - "public.login": ["id", "customer_id", "time"], - } - } actual_result = generate_dataset.get_postgres_collections_and_fields(engine) - assert actual_result == expected_result + assert actual_result == TestPostgres.EXPECTED_COLLECTION @pytest.mark.integration - def test_generate_dataset_postgres(self): - actual_result = generate_dataset.generate_dataset(POSTGRES_URL, "test_file.yml") + def test_generate_dataset_postgres(self, tmpdir): + actual_result = generate_dataset.generate_dataset( + POSTGRES_URL, f"{tmpdir}/test_file.yml" + ) assert actual_result + @pytest.mark.integration + def test_generate_dataset_passes_postgres(self, test_config): + datasets: List[Dataset] = generate_dataset.create_dataset_collections( + TestPostgres.EXPECTED_COLLECTION + ) + set_field_data_categories(datasets, "system.operations") + create_server_datasets(test_config, datasets) + generate_dataset.database_coverage( + connection_string=POSTGRES_URL, + manifest_dir="", + coverage_threshold=100, + url=test_config.cli.server_url, + headers=test_config.user.request_headers, + ) + + @pytest.mark.integration + def test_generate_dataset_coverage_failure_postgres(self, test_config): + datasets: List[Dataset] = generate_dataset.create_dataset_collections( + TestPostgres.EXPECTED_COLLECTION + ) + create_server_datasets(test_config, datasets) + with pytest.raises(SystemExit): + generate_dataset.database_coverage( + connection_string=POSTGRES_URL, + manifest_dir="", + coverage_threshold=100, + url=test_config.cli.server_url, + headers=test_config.user.request_headers, + ) + + @pytest.mark.integration + def test_dataset_coverage_manifest_passes_postgres(self, test_config, tmpdir): + datasets: List[Dataset] = generate_dataset.create_dataset_collections( + TestPostgres.EXPECTED_COLLECTION + ) + set_field_data_categories(datasets, "system.operations") + + file_name = tmpdir.join("dataset.yml") + write_manifest(file_name, [i.dict() for i in datasets], "dataset") + + create_server_datasets(test_config, datasets) + generate_dataset.database_coverage( + connection_string=POSTGRES_URL, + manifest_dir=f"{tmpdir}", + coverage_threshold=100, + url=test_config.cli.server_url, + headers=test_config.user.request_headers, + ) + @pytest.mark.mysql class TestMySQL: + EXPECTED_COLLECTION = { + "mysql_example": { + "visit": ["email", "last_visit"], + "login": ["id", "customer_id", "time"], + } + } + @pytest.fixture(scope="class", autouse=True) def mysql_setup(self): """ @@ -183,23 +380,74 @@ def mysql_setup(self): @pytest.mark.integration def test_get_db_tables_mysql(self): engine = sqlalchemy.create_engine(MYSQL_URL) - expected_result = { - "mysql_example": { - "visit": ["email", "last_visit"], - "login": ["id", "customer_id", "time"], - } - } actual_result = generate_dataset.get_mysql_collections_and_fields(engine) - assert actual_result == expected_result + assert actual_result == TestMySQL.EXPECTED_COLLECTION @pytest.mark.integration - def test_generate_dataset_mysql(self): - actual_result = generate_dataset.generate_dataset(MYSQL_URL, "test_file.yml") + def test_generate_dataset_mysql(self, tmpdir): + actual_result = generate_dataset.generate_dataset( + MYSQL_URL, f"{tmpdir}test_file.yml" + ) assert actual_result + @pytest.mark.integration + def test_generate_dataset_passes_mysql(self, test_config): + 
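+        # With every field annotated and the datasets pushed to the server, the
+        # MySQL scan is expected to report 100% coverage and not raise SystemExit.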
datasets: List[Dataset] = generate_dataset.create_dataset_collections( + TestMySQL.EXPECTED_COLLECTION + ) + set_field_data_categories(datasets, "system.operations") + create_server_datasets(test_config, datasets) + generate_dataset.database_coverage( + connection_string=MYSQL_URL, + manifest_dir="", + coverage_threshold=100, + url=test_config.cli.server_url, + headers=test_config.user.request_headers, + ) + + @pytest.mark.integration + def test_generate_dataset_coverage_failure_mysql(self, test_config): + datasets: List[Dataset] = generate_dataset.create_dataset_collections( + TestMySQL.EXPECTED_COLLECTION + ) + create_server_datasets(test_config, datasets) + with pytest.raises(SystemExit): + generate_dataset.database_coverage( + connection_string=MYSQL_URL, + manifest_dir="", + coverage_threshold=100, + url=test_config.cli.server_url, + headers=test_config.user.request_headers, + ) + + @pytest.mark.integration + def test_dataset_coverage_manifest_passes_mysql(self, test_config, tmpdir): + datasets: List[Dataset] = generate_dataset.create_dataset_collections( + TestMySQL.EXPECTED_COLLECTION + ) + set_field_data_categories(datasets, "system.operations") + + file_name = tmpdir.join("dataset.yml") + write_manifest(file_name, [i.dict() for i in datasets], "dataset") + + create_server_datasets(test_config, datasets) + generate_dataset.database_coverage( + connection_string=MYSQL_URL, + manifest_dir=f"{tmpdir}", + coverage_threshold=100, + url=test_config.cli.server_url, + headers=test_config.user.request_headers, + ) @pytest.mark.mssql class TestSQLServer: + EXPECTED_COLLECTION = { + "dbo": { + "visit": ["email", "last_visit"], + "login": ["id", "customer_id", "time"], + } + } + @pytest.fixture(scope="class", autouse=True) def mssql_setup(self): """ @@ -219,16 +467,61 @@ def mssql_setup(self): @pytest.mark.integration def test_get_db_tables_mssql(self): engine = sqlalchemy.create_engine(MSSQL_URL) - expected_result = { - "dbo": { - "visit": ["email", "last_visit"], - "login": ["id", "customer_id", "time"], - } - } actual_result = generate_dataset.get_mssql_collections_and_fields(engine) - assert actual_result == expected_result + assert actual_result == TestSQLServer.EXPECTED_COLLECTION @pytest.mark.integration - def test_generate_dataset_mssql(self): - actual_result = generate_dataset.generate_dataset(MSSQL_URL, "test_file.yml") + def test_generate_dataset_mssql(self, tmpdir): + actual_result = generate_dataset.generate_dataset( + MSSQL_URL, f"{tmpdir}/test_file.yml" + ) assert actual_result + + @pytest.mark.integration + def test_generate_dataset_passes_mssql(self, test_config): + datasets: List[Dataset] = generate_dataset.create_dataset_collections( + TestSQLServer.EXPECTED_COLLECTION + ) + set_field_data_categories(datasets, "system.operations") + create_server_datasets(test_config, datasets) + generate_dataset.database_coverage( + connection_string=MSSQL_URL, + manifest_dir="", + coverage_threshold=100, + url=test_config.cli.server_url, + headers=test_config.user.request_headers, + ) + + @pytest.mark.integration + def test_generate_dataset_coverage_failure_mssql(self, test_config): + datasets: List[Dataset] = generate_dataset.create_dataset_collections( + TestSQLServer.EXPECTED_COLLECTION + ) + create_server_datasets(test_config, datasets) + with pytest.raises(SystemExit): + generate_dataset.database_coverage( + connection_string=MSSQL_URL, + manifest_dir="", + coverage_threshold=100, + url=test_config.cli.server_url, + headers=test_config.user.request_headers, + ) + + 
@pytest.mark.integration + def test_dataset_coverage_manifest_passes_mssql(self, test_config, tmpdir): + datasets: List[Dataset] = generate_dataset.create_dataset_collections( + TestSQLServer.EXPECTED_COLLECTION + ) + set_field_data_categories(datasets, "system.operations") + + file_name = tmpdir.join("dataset.yml") + write_manifest(file_name, [i.dict() for i in datasets], "dataset") + + create_server_datasets(test_config, datasets) + generate_dataset.database_coverage( + connection_string=MSSQL_URL, + manifest_dir=f"{tmpdir}", + coverage_threshold=100, + url=test_config.cli.server_url, + headers=test_config.user.request_headers, + )