From 861201b96ceca626ca47de6bfb31e6f03620f28c Mon Sep 17 00:00:00 2001 From: Facundo Lopez Janza <56484504+Linker44@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:08:35 -0300 Subject: [PATCH] Added masking_strategy_override at field level (#5446) Co-authored-by: Adrian Galvan --- CHANGELOG.md | 1 + .../dataset/bigquery_example_test_dataset.yml | 21 +- ...le_field_masking_override_test_dataset.yml | 252 +++++++++++++++++ ...alid_masking_strategy_override_dataset.yml | 43 +++ .../dataset/postgres_example_test_dataset.yml | 6 + requirements.txt | 2 +- .../api/api/v1/endpoints/dataset_endpoints.py | 2 + .../api/api/v1/endpoints/router_factory.py | 16 +- src/fides/api/db/seed.py | 46 +--- src/fides/api/graph/config.py | 10 +- src/fides/api/models/datasetconfig.py | 37 +++ .../api/schemas/saas/connector_template.py | 2 + .../api/service/connectors/query_config.py | 52 ++-- src/fides/api/util/data_category.py | 46 ++++ src/fides/core/api.py | 7 +- tests/ctl/api/test_seed.py | 13 +- tests/ctl/conftest.py | 15 +- tests/fixtures/application_fixtures.py | 120 ++++++++- .../v1/endpoints/test_dataset_endpoints.py | 255 ++++++++++++++++++ tests/ops/graph/test_config.py | 12 + .../service/connectors/test_queryconfig.py | 52 +++- .../test_request_runner_service.py | 80 ++++-- tests/ops/task/test_create_request_tasks.py | 15 ++ 23 files changed, 993 insertions(+), 112 deletions(-) create mode 100644 data/dataset/example_field_masking_override_test_dataset.yml create mode 100644 data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index d133eeeae8..64652fff67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ The types of changes are: ### Added - Added namespace support for Snowflake [#5486](https://github.com/ethyca/fides/pull/5486) +- Added support for field-level masking overrides [#5446](https://github.com/ethyca/fides/pull/5446) ### Developer Experience - Migrated several instances of Chakra's 
Select component to use Ant's Select component [#5475](https://github.com/ethyca/fides/pull/5475) diff --git a/data/dataset/bigquery_example_test_dataset.yml b/data/dataset/bigquery_example_test_dataset.yml index 0420f04f55..11fdac1aba 100644 --- a/data/dataset/bigquery_example_test_dataset.yml +++ b/data/dataset/bigquery_example_test_dataset.yml @@ -5,7 +5,7 @@ dataset: collections: - name: address fides_meta: - erase_after: [ bigquery_example_test_dataset.employee ] + erase_after: [bigquery_example_test_dataset.employee] fields: - name: city data_categories: [user.contact.address.city] @@ -19,12 +19,18 @@ dataset: data_categories: [user.contact.address.state] - name: street data_categories: [user.contact.address.street] + fides_meta: + data_type: string + masking_strategy_override: + strategy: string_rewrite + configuration: + rewrite_value: REDACTED - name: zip data_categories: [user.contact.address.postal_code] - name: customer fides_meta: - erase_after: [ bigquery_example_test_dataset.address ] + erase_after: [bigquery_example_test_dataset.address] fields: - name: address_id data_categories: [system.operations] @@ -238,11 +244,12 @@ dataset: - name: visit_partitioned fides_meta: partitioning: - where_clauses: [ - "`last_visit` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 500 DAY) AND `last_visit` <= CURRENT_TIMESTAMP()", - "`last_visit` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `last_visit` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 500 DAY)", - "`last_visit` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY)", - ] + where_clauses: + [ + "`last_visit` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 500 DAY) AND `last_visit` <= CURRENT_TIMESTAMP()", + "`last_visit` > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY) AND `last_visit` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 500 DAY)", + "`last_visit` <= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1000 DAY)", + ] fields: - name: email data_categories: [user.contact.email] diff 
--git a/data/dataset/example_field_masking_override_test_dataset.yml b/data/dataset/example_field_masking_override_test_dataset.yml new file mode 100644 index 0000000000..24bdf84555 --- /dev/null +++ b/data/dataset/example_field_masking_override_test_dataset.yml @@ -0,0 +1,252 @@ +dataset: + - fides_key: field_masking_override_test_dataset + name: Field Masking Override Test Dataset + description: Example of a dataset containing masking strategy override at the field-level. + collections: + - name: address + fields: + - name: city + data_categories: [user.contact.address.city] + - name: house + data_categories: [user.contact.address.street] + - name: id + data_categories: [system.operations] + fides_meta: + primary_key: True + - name: state + data_categories: [user.contact.address.state] + - name: street + data_categories: [user.contact.address.street] + - name: zip + data_categories: [user.contact.address.postal_code] + + - name: customer + fields: + - name: address_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: field_masking_override_test_dataset + field: address.id + direction: to + - name: created + data_categories: [system.operations] + - name: email + data_categories: [user.contact.email] + fides_meta: + identity: email + data_type: string + - name: id + data_categories: [user.unique_id] + fides_meta: + primary_key: True + - name: name + data_categories: [user.name] + fides_meta: + data_type: string + length: 40 + masking_strategy_override: + strategy: random_string_rewrite + configuration: + length: 5 + format_preservation: + suffix: "@example.com" + - name: address + fields: + - name: city + data_categories: [user.contact.address.city] + - name: house + data_categories: [user.contact.address.street] + fides_meta: + data_type: string + masking_strategy_override: + strategy: string_rewrite + configuration: + rewrite_value: "1234" + format_preservation: + suffix: "-test" + - name: state + data_categories: 
[user.contact.address.state] + masking_strategy_override: + strategy: null_rewrite + - name: street + data_categories: [user.contact.address.street] + - name: zip + data_categories: [user.contact.address.postal_code] + + - name: employee + fields: + - name: address_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: field_masking_override_test_dataset + field: address.id + direction: to + - name: email + data_categories: [user.contact.email] + fides_meta: + identity: email + data_type: string + - name: id + data_categories: [user.unique_id] + fides_meta: + primary_key: True + - name: name + data_categories: [user.name] + fides_meta: + data_type: string + + - name: login + fields: + - name: customer_id + data_categories: [user.unique_id] + fides_meta: + references: + - dataset: field_masking_override_test_dataset + field: customer.id + direction: from + - name: id + data_categories: [system.operations] + fides_meta: + primary_key: True + - name: time + data_categories: [user.sensor] + + - name: orders + fields: + - name: customer_id + data_categories: [user.unique_id] + fides_meta: + references: + - dataset: field_masking_override_test_dataset + field: customer.id + direction: from + - name: id + data_categories: [system.operations] + fides_meta: + primary_key: True + - name: shipping_address_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: field_masking_override_test_dataset + field: address.id + direction: to + + # order_item + - name: order_item + fields: + - name: order_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: field_masking_override_test_dataset + field: orders.id + direction: from + - name: product_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: field_masking_override_test_dataset + field: product.id + direction: to + - name: quantity + data_categories: [system.operations] + + - name: payment_card + fields: + - name: 
billing_address_id + data_categories: [system.operations] + fides_meta: + references: + - dataset: field_masking_override_test_dataset + field: address.id + direction: to + - name: ccn + data_categories: [user.financial.bank_account] + - name: code + data_categories: [user.financial] + - name: customer_id + data_categories: [user.unique_id] + fides_meta: + references: + - dataset: field_masking_override_test_dataset + field: customer.id + direction: from + - name: id + data_categories: [system.operations] + fides_meta: + primary_key: True + - name: name + data_categories: [user.financial] + - name: preferred + data_categories: [user] + + - name: product + fields: + - name: id + data_categories: [system.operations] + fides_meta: + primary_key: True + - name: name + data_categories: [system.operations] + - name: price + data_categories: [system.operations] + + - name: report + fields: + - name: email + data_categories: [user.contact.email] + fides_meta: + identity: email + data_type: string + - name: id + data_categories: [system.operations] + fides_meta: + primary_key: True + - name: month + data_categories: [system.operations] + - name: name + data_categories: [system.operations] + - name: total_visits + data_categories: [system.operations] + - name: year + data_categories: [system.operations] + + - name: service_request + fields: + - name: alt_email + data_categories: [user.contact.email] + fides_meta: + identity: email + data_type: string + - name: closed + data_categories: [system.operations] + - name: email + data_categories: [system.operations] + fides_meta: + identity: email + data_type: string + - name: employee_id + data_categories: [user.unique_id] + fides_meta: + references: + - dataset: field_masking_override_test_dataset + field: employee.id + direction: from + - name: id + data_categories: [system.operations] + fides_meta: + primary_key: True + - name: opened + data_categories: [system.operations] + - name: visit + fields: + - name: email + 
data_categories: [user.contact.email] + fides_meta: + identity: email + data_type: string + - name: last_visit + data_categories: [system.operations] diff --git a/data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml b/data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml new file mode 100644 index 0000000000..5195a3671a --- /dev/null +++ b/data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml @@ -0,0 +1,43 @@ +dataset: + - fides_key: postgres_example_invalid_masking_strategy_override + name: Postgres Example Invalid Masking Strategy Override Test Dataset + description: Example of a Postgres dataset containing an invalid masking strategy override + collections: + - name: customer + fields: + - name: created + data_categories: [system.operations] + - name: email + data_categories: [user.contact.email] + fides_meta: + identity: email + data_type: string + - name: id + data_categories: [user.unique_id] + fides_meta: + primary_key: True + - name: name + data_categories: [user.name] + fides_meta: + data_type: string + length: 40 + + - name: employee + fields: + - name: email + data_categories: [user.contact.email] + fides_meta: + identity: email + data_type: string + - name: id + data_categories: [user.unique_id] + fides_meta: + primary_key: True + - name: name + data_categories: [user.name] + fides_meta: + data_type: string + masking_strategy_override: + strategy: hash + configuration: + algorithm: SHA-256 diff --git a/data/dataset/postgres_example_test_dataset.yml b/data/dataset/postgres_example_test_dataset.yml index 5b9ddc2658..d62eb38d46 100644 --- a/data/dataset/postgres_example_test_dataset.yml +++ b/data/dataset/postgres_example_test_dataset.yml @@ -68,6 +68,12 @@ dataset: data_categories: [user.name] fides_meta: data_type: string + masking_strategy_override: + strategy: string_rewrite + configuration: + rewrite_value: testing + format_preservation: + suffix: "-test" - name: login fields: diff 
--git a/requirements.txt b/requirements.txt index 6943e933c5..c7fe1cbe4c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -71,4 +71,4 @@ twilio==7.15.0 typing-extensions==4.12.2 validators==0.20.0 versioneer==0.19 -fideslang==3.0.8 +fideslang==3.0.9 diff --git a/src/fides/api/api/v1/endpoints/dataset_endpoints.py b/src/fides/api/api/v1/endpoints/dataset_endpoints.py index 08677b8b2c..5223e4d6d9 100644 --- a/src/fides/api/api/v1/endpoints/dataset_endpoints.py +++ b/src/fides/api/api/v1/endpoints/dataset_endpoints.py @@ -39,6 +39,7 @@ DatasetConfig, convert_dataset_to_graph, to_graph_field, + validate_masking_strategy_override, ) from fides.api.oauth.utils import verify_oauth_client from fides.api.schemas.api import BulkUpdateFailed @@ -417,6 +418,7 @@ def create_or_update_dataset( # when a ctl_dataset is being linked to a Saas Connector. _validate_saas_dataset(connection_config, dataset) # type: ignore # Try to find an existing DatasetConfig matching the given connection & key + validate_masking_strategy_override(dataset) dataset_config = create_method(db, data=data) created_or_updated.append(dataset_config.ctl_dataset) except ( diff --git a/src/fides/api/api/v1/endpoints/router_factory.py b/src/fides/api/api/v1/endpoints/router_factory.py index 1a5e34ce29..b7be7047e7 100644 --- a/src/fides/api/api/v1/endpoints/router_factory.py +++ b/src/fides/api/api/v1/endpoints/router_factory.py @@ -15,6 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from starlette.status import HTTP_422_UNPROCESSABLE_ENTITY +from fides.api.common_exceptions import ValidationError from fides.api.db.crud import ( create_resource, delete_resource, @@ -24,6 +25,7 @@ upsert_resources, ) from fides.api.db.ctl_session import get_async_db +from fides.api.models.datasetconfig import validate_masking_strategy_override from fides.api.models.sql_models import ( DataCategory, ModelWithDefaultField, @@ -68,6 +70,16 @@ async def validate_data_categories( ) +def 
validate_masking_strategy(dataset: Dataset) -> None: + try: + validate_masking_strategy_override(dataset) + except ValidationError as e: + raise HTTPException( + status_code=HTTP_422_UNPROCESSABLE_ENTITY, + detail=jsonable_encoder(e.message), + ) + + def generic_router_factory(fides_model: FidesModelType, model_type: str) -> APIRouter: """ Compose all of the individual route factories into a single coherent Router. @@ -145,6 +157,7 @@ async def create( sql_model = sql_model_map[model_type] if isinstance(resource, Dataset): await validate_data_categories(resource, db) + validate_masking_strategy(resource) if isinstance(sql_model, ModelWithDefaultField) and resource.is_default: raise errors.ForbiddenIsDefaultTaxonomyError( model_type, resource.fides_key, action="create" @@ -249,6 +262,7 @@ async def update( sql_model = sql_model_map[model_type] if isinstance(resource, Dataset): await validate_data_categories(resource, db) + validate_masking_strategy(resource) await forbid_if_editing_is_default(sql_model, resource.fides_key, resource, db) return await update_resource(sql_model, resource.model_dump(mode="json"), db) @@ -330,7 +344,7 @@ async def upsert( for resource in resources: if isinstance(resource, Dataset): await validate_data_categories(resource, db) - + validate_masking_strategy(resource) await forbid_if_editing_any_is_default(sql_model, resource_dicts, db) result = await upsert_resources(sql_model, resource_dicts, db) response.status_code = ( diff --git a/src/fides/api/db/seed.py b/src/fides/api/db/seed.py index d549521016..90e6c7dfff 100644 --- a/src/fides/api/db/seed.py +++ b/src/fides/api/db/seed.py @@ -38,6 +38,7 @@ from fides.api.schemas.dataset import DatasetConfigCtlDataset from fides.api.schemas.policy import ActionType, DrpAction from fides.api.util.connection_util import patch_connection_configs +from fides.api.util.data_category import get_user_data_categories from fides.api.util.errors import AlreadyExistsError, QueryError from fides.api.util.text 
import to_snake_case from fides.config import CONFIG @@ -114,36 +115,6 @@ def create_or_update_parent_user() -> None: ) -def filter_data_categories( - categories: List[str], excluded_categories: List[str] -) -> List[str]: - """ - Filter data categories and their children out of a list of categories. - - We only want user-related data categories, but not the parent category - We also only want 2nd level categories, otherwise there are policy conflicts - """ - user_categories = [ - category - for category in categories - if category.startswith("user.") and len(category.split(".")) < 3 - ] - if excluded_categories: - duplicated_categories = [ - category - for excluded_category in excluded_categories - for category in user_categories - if not category.startswith(excluded_category) - ] - default_categories = { - category - for category in duplicated_categories - if duplicated_categories.count(category) == len(excluded_categories) - } - return sorted(list(default_categories)) - return sorted(user_categories) - - def get_client_id(db_session: Session) -> str: client = ClientDetail.get_by( db=db_session, @@ -325,18 +296,9 @@ def load_default_dsr_policies() -> None: # organizations need to be extra careful about how these are used - # especially for erasure! 
Therefore, a safe default for "out of the # box" behaviour is to exclude these - excluded_data_categories = [ - "user.financial", - "user.payment", - "user.authorization", - ] - all_data_categories = [ - str(category.fides_key) - for category in DEFAULT_TAXONOMY.data_category # pylint:disable=not-an-iterable - ] - default_data_categories = filter_data_categories( - all_data_categories, excluded_data_categories - ) + + default_data_categories = get_user_data_categories() + log.debug( f"Preparing to create default rules for the following Data Categories: {default_data_categories} if they do not already exist" ) diff --git a/src/fides/api/graph/config.py b/src/fides/api/graph/config.py index 9366f9ee04..19ffab4102 100644 --- a/src/fides/api/graph/config.py +++ b/src/fides/api/graph/config.py @@ -85,7 +85,7 @@ from re import match, search from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, Union -from fideslang.models import MaskingStrategyOverride +from fideslang.models import FieldMaskingStrategyOverride, MaskingStrategyOverride from fideslang.validation import FidesKey from pydantic import BaseModel, ConfigDict, field_serializer, field_validator @@ -262,8 +262,8 @@ class Field(BaseModel, ABC): data_categories: Optional[List[FidesKey]] = None data_type_converter: DataTypeConverter = DataType.no_op.value return_all_elements: Optional[bool] = None + masking_strategy_override: Optional[FieldMaskingStrategyOverride] = None # Should field be returned by query if it is in an entrypoint array field, or just if it matches query? 
- custom_request_field: Optional[str] = None """Known type of held data""" @@ -393,6 +393,7 @@ def generate_field( return_all_elements: Optional[bool], read_only: Optional[bool], custom_request_field: Optional[str], + masking_strategy_override: Optional[FieldMaskingStrategyOverride], ) -> Field: """Generate a graph field.""" @@ -417,12 +418,13 @@ def generate_field( return_all_elements=return_all_elements, read_only=read_only, custom_request_field=custom_request_field, + masking_strategy_override=masking_strategy_override, ) @dataclass -class MaskingOverride: - """Data class to store override params related to data masking""" +class MaskingTruncation: + """Data class to store truncation params related to data masking""" data_type_converter: Optional[DataTypeConverter] length: Optional[int] diff --git a/src/fides/api/models/datasetconfig.py b/src/fides/api/models/datasetconfig.py index 6ecc311dad..3e8ebeff9f 100644 --- a/src/fides/api/models/datasetconfig.py +++ b/src/fides/api/models/datasetconfig.py @@ -19,6 +19,7 @@ ) from fides.api.graph.data_type import parse_data_type_string from fides.api.models.connectionconfig import ConnectionConfig, ConnectionType +from fides.api.service.masking.strategy.masking_strategy import MaskingStrategy from fides.api.util.saas_util import merge_datasets from fides.api.models.sql_models import ( # type: ignore[attr-defined] # isort: skip @@ -72,6 +73,7 @@ def upsert_ctl_dataset(ctl_dataset_obj: Optional[CtlDataset]) -> CtlDataset: """ ctl_dataset_data = data.copy() validated_data = Dataset(**ctl_dataset_data.get("dataset", {})) + if ctl_dataset_obj: # It's possible this updates the ctl_dataset.fides_key and this causes a conflict # with another ctl_dataset, if we fetched the datasetconfig.ctl_dataset. 
@@ -205,6 +207,7 @@ def to_graph_field( data_type_name = None read_only = None custom_request_field = None + masking_strategy_override = None if meta_section: identity = meta_section.identity @@ -252,6 +255,9 @@ def to_graph_field( # here in case we decide to allow it in the future. length = meta_section.length + if meta_section.masking_strategy_override: + masking_strategy_override = meta_section.masking_strategy_override + (data_type_name, is_array) = parse_data_type_string(meta_section.data_type) if meta_section.return_all_elements: @@ -277,6 +283,7 @@ def to_graph_field( return_all_elements=return_all_elements, read_only=read_only, custom_request_field=custom_request_field, + masking_strategy_override=masking_strategy_override, ) @@ -403,3 +410,33 @@ def validate_dataset_reference( raise ValidationError( f"Unknown field '{dataset_reference.field}' in dataset '{dataset_config.fides_key}' referenced by external reference" ) + + +def validate_masking_strategy_override(dataset: Dataset) -> None: + """ + Validates that field-level masking overrides do not require secret keys. + When handling a privacy request, we use the `cache_data` function to review the policies and identify which masking strategies need secret keys generated and cached. + Currently, we are avoiding the additional complexity of scanning datasets for masking overrides. + """ + + def validate_field(dataset_field: DatasetField) -> None: + if dataset_field.fields: + for subfield in dataset_field.fields: + validate_field(subfield) + else: + if ( + dataset_field.fides_meta + and dataset_field.fides_meta.masking_strategy_override + ): + strategy: MaskingStrategy = MaskingStrategy.get_strategy( + dataset_field.fides_meta.masking_strategy_override.strategy, + dataset_field.fides_meta.masking_strategy_override.configuration, # type: ignore[arg-type] + ) + if strategy.secrets_required(): + raise ValidationError( + f"Masking strategy '{strategy.name}' with required secrets not allowed as an override." 
+ ) + + for collection in dataset.collections: + for field in collection.fields: + validate_field(field) diff --git a/src/fides/api/schemas/saas/connector_template.py b/src/fides/api/schemas/saas/connector_template.py index 1258127ec2..dbc1ace845 100644 --- a/src/fides/api/schemas/saas/connector_template.py +++ b/src/fides/api/schemas/saas/connector_template.py @@ -3,6 +3,7 @@ from fideslang.models import Dataset from pydantic import BaseModel, field_validator +from fides.api.models.datasetconfig import validate_masking_strategy_override from fides.api.schemas.policy import ActionType from fides.api.schemas.saas.saas_config import SaaSConfig from fides.api.util.saas_util import load_config_from_string, load_dataset_from_string @@ -38,6 +39,7 @@ def validate_config(cls, value: str) -> str: def validate_dataset(cls, dataset: str) -> str: """Validates the dataset at the given path""" saas_dataset = Dataset(**load_dataset_from_string(dataset)) + validate_masking_strategy_override(saas_dataset) if saas_dataset.fides_key != "": raise ValueError( "Hard-coded fides_key detected in the dataset, replace all instances of it with " diff --git a/src/fides/api/service/connectors/query_config.py b/src/fides/api/service/connectors/query_config.py index bd30736c40..be95ebcaed 100644 --- a/src/fides/api/service/connectors/query_config.py +++ b/src/fides/api/service/connectors/query_config.py @@ -19,7 +19,7 @@ CollectionAddress, Field, FieldPath, - MaskingOverride, + MaskingTruncation, ) from fides.api.graph.execution import ExecutionNode from fides.api.models.policy import Policy, Rule @@ -165,26 +165,32 @@ def update_value_map( # pylint: disable=R0914 strategy_config = rule.masking_strategy if not strategy_config: continue - strategy: MaskingStrategy = MaskingStrategy.get_strategy( - strategy_config["strategy"], strategy_config["configuration"] - ) for rule_field_path in field_paths: - masking_override: MaskingOverride = [ - MaskingOverride(field.data_type_converter, field.length) 
+ strategy: MaskingStrategy = MaskingStrategy.get_strategy( + strategy_config["strategy"], strategy_config["configuration"] + ) + truncation: MaskingTruncation = [ + MaskingTruncation(field.data_type_converter, field.length) for field_path, field in self.field_map().items() if field_path == rule_field_path ][0] - null_masking: bool = ( - strategy_config.get("strategy") == NullMaskingStrategy.name - ) - if not self._supported_data_type( - masking_override, null_masking, strategy - ): + field = self.field_map().get(rule_field_path) + if field and field.masking_strategy_override: + masking_strategy_override = field.masking_strategy_override + strategy = MaskingStrategy.get_strategy( + masking_strategy_override.strategy, + masking_strategy_override.configuration, # type: ignore[arg-type] + ) + logger.warning( + f"Using field-level masking override of type '{strategy.name}' for {rule_field_path.string_path}" + ) + null_masking: bool = strategy.name == NullMaskingStrategy.name + if not self._supported_data_type(truncation, null_masking, strategy): logger.warning( "Unable to generate a query for field {}: data_type is either not present on the field or not supported for the {} masking strategy. 
Received data type: {}", rule_field_path.string_path, - strategy_config["strategy"], - masking_override.data_type_converter.name, # type: ignore + strategy.name, + truncation.data_type_converter.name, # type: ignore ) continue @@ -199,7 +205,7 @@ def update_value_map( # pylint: disable=R0914 request_id=request.id, strategy=strategy, val=pydash.objects.get(row, detailed_path), - masking_override=masking_override, + masking_truncation=truncation, null_masking=null_masking, str_field_path=detailed_path, ) @@ -207,15 +213,17 @@ def update_value_map( # pylint: disable=R0914 @staticmethod def _supported_data_type( - masking_override: MaskingOverride, null_masking: bool, strategy: MaskingStrategy + masking_truncation: MaskingTruncation, + null_masking: bool, + strategy: MaskingStrategy, ) -> bool: """Helper method to determine whether given data_type exists and is supported by the masking strategy""" if null_masking: return True - if not masking_override.data_type_converter: + if not masking_truncation.data_type_converter: return False if not strategy.data_type_supported( - data_type=masking_override.data_type_converter.name + data_type=masking_truncation.data_type_converter.name ): return False return True @@ -225,7 +233,7 @@ def _generate_masked_value( # pylint: disable=R0913 request_id: str, strategy: MaskingStrategy, val: Any, - masking_override: MaskingOverride, + masking_truncation: MaskingTruncation, null_masking: bool, str_field_path: str, ) -> T: @@ -242,14 +250,14 @@ def _generate_masked_value( # pylint: disable=R0913 if null_masking: return masked_val - if masking_override.length: + if masking_truncation.length: logger.warning( "Because a length has been specified for field {}, we will truncate length of masked value to match, regardless of masking strategy", str_field_path, ) # for strategies other than null masking we assume that masked data type is the same as specified data type - masked_val = masking_override.data_type_converter.truncate( # type: ignore - 
masking_override.length, masked_val + masked_val = masking_truncation.data_type_converter.truncate( # type: ignore + masking_truncation.length, masked_val ) return masked_val diff --git a/src/fides/api/util/data_category.py b/src/fides/api/util/data_category.py index fa8f5b4414..71c24cf366 100644 --- a/src/fides/api/util/data_category.py +++ b/src/fides/api/util/data_category.py @@ -43,3 +43,49 @@ def _validate_data_category( f"The data category '{data_category}' was not found in the database, and is therefore not valid for use here." ) return data_category + + +def get_user_data_categories() -> List[str]: + # organizations need to be extra careful about how these are used - + # especially for erasure! Therefore, a safe default for "out of the + # box" behaviour is to exclude these + excluded_data_categories = [ + "user.financial", + "user.payment", + "user.authorization", + ] + all_data_categories = [ + str(category.fides_key) + for category in DEFAULT_TAXONOMY.data_category # pylint:disable=not-an-iterable + ] + return filter_data_categories(all_data_categories, excluded_data_categories) + + +def filter_data_categories( + categories: List[str], excluded_categories: List[str] +) -> List[str]: + """ + Filter data categories and their children out of a list of categories. 
+ + We only want user-related data categories, but not the parent category + We also only want 2nd level categories, otherwise there are policy conflicts + """ + user_categories = [ + category + for category in categories + if category.startswith("user.") and len(category.split(".")) < 3 + ] + if excluded_categories: + duplicated_categories = [ + category + for excluded_category in excluded_categories + for category in user_categories + if not category.startswith(excluded_category) + ] + default_categories = { + category + for category in duplicated_categories + if duplicated_categories.count(category) == len(excluded_categories) + } + return sorted(list(default_categories)) + return sorted(user_categories) diff --git a/src/fides/core/api.py b/src/fides/core/api.py index 9ca53e2a45..c6635595e4 100644 --- a/src/fides/core/api.py +++ b/src/fides/core/api.py @@ -145,4 +145,9 @@ def db_action( """ Tell the API to perform a database action. """ - return requests.post(f"{server_url}{API_PREFIX}/admin/db/{action}", headers=headers) + return requests.post( + f"{server_url}{API_PREFIX}/admin/db/{action}", + headers=headers, + allow_redirects=False, + timeout=30, + ) diff --git a/tests/ctl/api/test_seed.py b/tests/ctl/api/test_seed.py index 6333c2c530..454607ea1b 100644 --- a/tests/ctl/api/test_seed.py +++ b/tests/ctl/api/test_seed.py @@ -16,6 +16,7 @@ from fides.api.models.fides_user import FidesUser from fides.api.models.policy import ActionType, DrpAction, Policy, Rule, RuleTarget from fides.api.models.sql_models import Dataset, PolicyCtl, System +from fides.api.util.data_category import filter_data_categories from fides.config import CONFIG, FidesConfig from fides.core import api as _api @@ -104,7 +105,7 @@ def test_filter_data_categories_excluded(self) -> None: "user.name", "user.test", ] - assert seed.filter_data_categories( + assert filter_data_categories( all_data_categories, excluded_data_categories ) == sorted(expected_result) @@ -130,7 +131,7 @@ def 
test_filter_data_categories_no_third_level(self) -> None: "user.name", "user.test", ] - assert seed.filter_data_categories( + assert filter_data_categories( all_data_categories, excluded_data_categories ) == sorted(expected_result) @@ -145,7 +146,7 @@ def test_filter_data_categories_no_top_level(self) -> None: "user.name", "user.test", ] - assert seed.filter_data_categories(all_data_categories, []) == expected_result + assert filter_data_categories(all_data_categories, []) == expected_result def test_filter_data_categories_empty_excluded(self) -> None: """Test that the filter method works as intended""" @@ -155,7 +156,7 @@ def test_filter_data_categories_empty_excluded(self) -> None: "user.authorization", "user.financial", ] - assert seed.filter_data_categories(all_data_categories, []) == sorted( + assert filter_data_categories(all_data_categories, []) == sorted( all_data_categories ) @@ -167,7 +168,7 @@ def test_filter_data_categories_no_exclusions(self) -> None: "user.authorization", "user.financial", ] - assert seed.filter_data_categories( + assert filter_data_categories( all_data_categories, excluded_data_categories ) == sorted(all_data_categories) @@ -186,7 +187,7 @@ def test_filter_data_categories_only_return_users(self) -> None: "user.authorization", "user.financial", ] - assert seed.filter_data_categories(all_data_categories, []) == sorted( + assert filter_data_categories(all_data_categories, []) == sorted( expected_categories ) diff --git a/tests/ctl/conftest.py b/tests/ctl/conftest.py index db9f33e2ef..a4408feb45 100644 --- a/tests/ctl/conftest.py +++ b/tests/ctl/conftest.py @@ -45,18 +45,11 @@ def monkeypatch_requests(test_client, monkeysession) -> None: @pytest.fixture(scope="session", autouse=True) -@pytest.mark.usefixtures("monkeypatch_requests") -def setup_ctl_db(test_config, test_client, config): - "Sets up the database for testing." 
+def setup_db(api_client, config): + """Apply migrations at beginning and end of testing session""" assert config.test_mode - assert ( - requests.post == test_client.post - ) # Sanity check to make sure monkeypatch_requests fixture has run - yield api.db_action( - server_url=test_config.cli.server_url, - headers=config.user.auth_header, - action="reset", - ) + assert requests.post != api_client.post + yield api_client.post(url=f"{config.cli.server_url}/v1/admin/db/reset") @pytest.fixture(scope="session") diff --git a/tests/fixtures/application_fixtures.py b/tests/fixtures/application_fixtures.py index a21b25caaf..175eb8f578 100644 --- a/tests/fixtures/application_fixtures.py +++ b/tests/fixtures/application_fixtures.py @@ -111,7 +111,7 @@ from fides.api.service.masking.strategy.masking_strategy_string_rewrite import ( StringRewriteMaskingStrategy, ) -from fides.api.util.data_category import DataCategory +from fides.api.util.data_category import DataCategory, get_user_data_categories from fides.config import CONFIG from fides.config.helpers import load_file from tests.ops.integration_tests.saas.connector_runner import ( @@ -879,6 +879,66 @@ def erasure_policy( pass +@pytest.fixture(scope="function") +def biquery_erasure_policy( + db: Session, + oauth_client: ClientDetail, +) -> Generator: + erasure_policy = Policy.create( + db=db, + data={ + "name": "example erasure policy", + "key": "example_erasure_policy", + "client_id": oauth_client.id, + }, + ) + + erasure_rule = Rule.create( + db=db, + data={ + "action_type": ActionType.erasure.value, + "client_id": oauth_client.id, + "name": "Erasure Rule", + "policy_id": erasure_policy.id, + "masking_strategy": { + "strategy": "null_rewrite", + "configuration": {}, + }, + }, + ) + + user_name_target = RuleTarget.create( + db=db, + data={ + "client_id": oauth_client.id, + "data_category": DataCategory("user.name").value, + "rule_id": erasure_rule.id, + }, + ) + street_address_target = RuleTarget.create( + db=db, + data={ + 
"client_id": oauth_client.id, + "data_category": DataCategory("user.contact.address.street").value, + "rule_id": erasure_rule.id, + }, + ) + yield erasure_policy + try: + user_name_target.delete(db) + street_address_target.delete(db) + except ObjectDeletedError: + pass + try: + erasure_rule.delete(db) + except ObjectDeletedError: + pass + try: + erasure_policy.delete(db) + except ObjectDeletedError: + pass + + @pytest.fixture(scope="function") def erasure_policy_aes( db: Session, @@ -1030,6 +1090,64 @@ def erasure_policy_two_rules( pass +@pytest.fixture(scope="function") +def erasure_policy_all_categories( + db: Session, + oauth_client: ClientDetail, +) -> Generator: + erasure_policy = Policy.create( + db=db, + data={ + "name": "example erasure policy", + "key": "example_erasure_policy", + "client_id": oauth_client.id, + }, + ) + + erasure_rule = Rule.create( + db=db, + data={ + "action_type": ActionType.erasure.value, + "client_id": oauth_client.id, + "name": "Erasure Rule", + "policy_id": erasure_policy.id, + "masking_strategy": { + "strategy": "null_rewrite", + "configuration": {}, + }, + }, + ) + + filtered_categories = get_user_data_categories() + rule_targets = [] + + for category in filtered_categories: + rule_targets.append( + RuleTarget.create( + db=db, + data={ + "client_id": oauth_client.id, + "data_category": category, + "rule_id": erasure_rule.id, + }, + ) + ) + yield erasure_policy + try: + for rule_target in rule_targets: + rule_target.delete(db) + except ObjectDeletedError: + pass + try: + erasure_rule.delete(db) + except ObjectDeletedError: + pass + try: + erasure_policy.delete(db) + except ObjectDeletedError: + pass + + @pytest.fixture(scope="function") def empty_policy( db: Session, diff --git a/tests/ops/api/v1/endpoints/test_dataset_endpoints.py b/tests/ops/api/v1/endpoints/test_dataset_endpoints.py index 6bfc3463cb..8d9ad6fc2f 100644 --- a/tests/ops/api/v1/endpoints/test_dataset_endpoints.py +++ 
b/tests/ops/api/v1/endpoints/test_dataset_endpoints.py @@ -34,6 +34,7 @@ V1_URL_PREFIX, YAML_DATASETS, ) +from tests.fixtures.application_fixtures import load_dataset def _reject_key(dict: Dict, key: str) -> Dict: @@ -1051,6 +1052,50 @@ def test_put_create_dataset_configs_add_and_remove( db.refresh(connection_config) assert len(connection_config.datasets) == 1 + def test_put_create_dataset_configs_invalid_field_masking_strategy_override( + self, + db, + generate_auth_header, + api_client, + datasets_url, + connection_config: ConnectionConfig, + ): + # create ctl_datasets + example_dataset = load_dataset( + "data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml" + ) + field_masking_override_dataset = CtlDataset( + **example_dataset[0], organization_fides_key="default_organization" + ) + db.add(field_masking_override_dataset) + db.commit() + + # add the first dataset to the connection + auth_header = generate_auth_header(scopes=[DATASET_CREATE_OR_UPDATE]) + response = api_client.put( + datasets_url, + headers=auth_header, + json=[ + { + "fides_key": field_masking_override_dataset.fides_key, + "ctl_dataset_fides_key": field_masking_override_dataset.fides_key, + } + ], + ) + assert response.status_code == 200 + response_body = json.loads(response.text) + assert len(response_body["succeeded"]) == 0 + assert len(response_body["failed"]) == 1 + field_masking_override_dataset = response_body["failed"][0] + assert ( + field_masking_override_dataset["data"]["fides_key"] + == "postgres_example_invalid_masking_strategy_override" + ) + assert ( + field_masking_override_dataset["message"] + == "Masking strategy 'hash' with required secrets not allowed as an override." 
+ ) + class TestPutDatasets: @pytest.fixture @@ -1232,6 +1277,216 @@ def test_patch_datasets_bulk_create( scylladb_config.delete(db) scylladb_ctl_dataset.delete(db) + def test_patch_datasets_field_masking_strategy_override_create( + self, + datasets_url, + api_client: TestClient, + db: Session, + generate_auth_header, + connection_config, + ) -> None: + auth_header = generate_auth_header(scopes=[DATASET_CREATE_OR_UPDATE]) + example_dataset = load_dataset( + "data/dataset/example_field_masking_override_test_dataset.yml" + ) + response = api_client.patch( + datasets_url, headers=auth_header, json=example_dataset + ) + assert response.status_code == 200 + response_body = json.loads(response.text) + assert len(response_body["succeeded"]) == 1 + assert len(response_body["failed"]) == 0 + + field_masking_override_dataset = response_body["succeeded"][0] + field_masking_override_config = DatasetConfig.get_by( + db=db, field="fides_key", value="field_masking_override_test_dataset" + ) + assert field_masking_override_config is not None + field_masking_override_ctl_dataset = field_masking_override_config.ctl_dataset + assert field_masking_override_ctl_dataset is not None + assert ( + field_masking_override_dataset["fides_key"] + == "field_masking_override_test_dataset" + ) + assert ( + field_masking_override_dataset["name"] + == "Field Masking Override Test Dataset" + ) + assert ( + "Example of a dataset containing masking strategy override at the field-level." 
+ in field_masking_override_dataset["description"] + ) + assert len(field_masking_override_dataset["collections"]) == 11 + assert len(field_masking_override_ctl_dataset.collections) == 11 + + field_masking_override_config.delete(db) + field_masking_override_ctl_dataset.delete(db) + + def test_patch_datasets_invalid_field_masking_strategy_override_create( + self, + datasets_url, + api_client: TestClient, + db: Session, + generate_auth_header, + connection_config, + ) -> None: + auth_header = generate_auth_header(scopes=[DATASET_CREATE_OR_UPDATE]) + example_dataset = load_dataset( + "data/dataset/postgres_example_invalid_masking_strategy_override_dataset.yml" + ) + response = api_client.patch( + datasets_url, headers=auth_header, json=example_dataset + ) + assert response.status_code == 200 + response_body = json.loads(response.text) + assert len(response_body["succeeded"]) == 0 + assert len(response_body["failed"]) == 1 + + # Confirm that postgres dataset matches the values we provided + field_masking_override_dataset = response_body["failed"][0] + assert ( + field_masking_override_dataset["data"]["fides_key"] + == "postgres_example_invalid_masking_strategy_override" + ) + assert ( + field_masking_override_dataset["message"] + == "Masking strategy 'hash' with required secrets not allowed as an override." 
+ ) + field_masking_override_config = DatasetConfig.get_by( + db=db, + field="fides_key", + value="postgres_example_invalid_masking_strategy_override", + ) + assert field_masking_override_config is None + + def test_patch_datasets_field_masking_strategy_override_update( + self, + datasets_url, + api_client: TestClient, + db: Session, + generate_auth_header, + ) -> None: + # Create first, then update + auth_header = generate_auth_header(scopes=[DATASET_CREATE_OR_UPDATE]) + example_dataset = load_dataset( + "data/dataset/example_field_masking_override_test_dataset.yml" + ) + api_client.patch(datasets_url, headers=auth_header, json=example_dataset) + + valid_masking_override = { + "strategy": "string_rewrite", + "configuration": {"rewrite_value": "REDACTED"}, + } + + updated_datasets = example_dataset.copy()[0] + # Remove all collections from the dataset example, except for the customer table. + # Note we also need to remove customer.address_id as it references the addresses table + updated_datasets["collections"] = [ + collection + for collection in updated_datasets["collections"] + if collection["name"] == "customer" + ] + updated_datasets["collections"][0]["fields"] = [ + field + for field in updated_datasets["collections"][0]["fields"] + if field["name"] != "address_id" + ] + # Update the masking strategy override for the name field + for idx, field in enumerate(updated_datasets["collections"][0]["fields"]): + if field["name"] == "name": + updated_datasets["collections"][0]["fields"][idx]["fides_meta"][ + "masking_strategy_override" + ] = valid_masking_override + + response = api_client.patch( + datasets_url, headers=auth_header, json=[updated_datasets] + ) + + assert response.status_code == 200 + response_body = json.loads(response.text) + assert len(response_body["succeeded"]) == 1 + assert len(response_body["failed"]) == 0 + + # test postgres + field_masking_override_dataset = response_body["succeeded"][0] + assert ( + 
field_masking_override_dataset["fides_key"] + == "field_masking_override_test_dataset" + ) + field_masking_override_config = DatasetConfig.get_by( + db=db, field="fides_key", value="field_masking_override_test_dataset" + ) + + field_masking_override_config.delete(db) + postgres_ctl_dataset = field_masking_override_config.ctl_dataset + postgres_ctl_dataset.delete(db) + + def test_patch_datasets_invalid_field_masking_strategy_override_update( + self, + datasets_url, + api_client: TestClient, + db: Session, + generate_auth_header, + ) -> None: + # Create first, then update + auth_header = generate_auth_header(scopes=[DATASET_CREATE_OR_UPDATE]) + example_dataset = load_dataset( + "data/dataset/example_field_masking_override_test_dataset.yml" + ) + api_client.patch(datasets_url, headers=auth_header, json=example_dataset) + + invalid_masking_override = { + "strategy": "hash", + "configuration": {"algorithm": "SHA-256"}, + } + + updated_datasets = example_dataset.copy()[0] + # Remove all collections from the dataset example, except for the customer table. 
+ # Note we also need to remove customer.address_id as it references the addresses table + updated_datasets["collections"] = [ + collection + for collection in updated_datasets["collections"] + if collection["name"] == "customer" + ] + updated_datasets["collections"][0]["fields"] = [ + field + for field in updated_datasets["collections"][0]["fields"] + if field["name"] != "address_id" + ] + # Update the masking strategy override for the name field + for idx, field in enumerate(updated_datasets["collections"][0]["fields"]): + if field["name"] == "name": + updated_datasets["collections"][0]["fields"][idx]["fides_meta"][ + "masking_strategy_override" + ] = invalid_masking_override + + response = api_client.patch( + datasets_url, headers=auth_header, json=[updated_datasets] + ) + + assert response.status_code == 200 + response_body = json.loads(response.text) + assert len(response_body["succeeded"]) == 0 + assert len(response_body["failed"]) == 1 + + # test postgres + field_masking_override_dataset = response_body["failed"][0] + assert ( + field_masking_override_dataset["data"]["fides_key"] + == "field_masking_override_test_dataset" + ) + assert ( + field_masking_override_dataset["message"] + == "Masking strategy 'hash' with required secrets not allowed as an override." 
+ ) + field_masking_override_config = DatasetConfig.get_by( + db=db, field="fides_key", value="field_masking_override_test_dataset" + ) + + field_masking_override_config.delete(db) + postgres_ctl_dataset = field_masking_override_config.ctl_dataset + postgres_ctl_dataset.delete(db) + def test_patch_datasets_bulk_update( self, example_datasets, diff --git a/tests/ops/graph/test_config.py b/tests/ops/graph/test_config.py index 3564676539..40252fa888 100644 --- a/tests/ops/graph/test_config.py +++ b/tests/ops/graph/test_config.py @@ -141,6 +141,7 @@ def test_from_string(self): "is_array": False, "read_only": None, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "f2", @@ -154,6 +155,7 @@ def test_from_string(self): "is_array": False, "read_only": None, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "f3", @@ -167,6 +169,7 @@ def test_from_string(self): "is_array": True, "read_only": False, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "f4", @@ -180,6 +183,7 @@ def test_from_string(self): "is_array": False, "read_only": None, "custom_request_field": None, + "masking_strategy_override": None, "fields": { "f5": { "name": "f5", @@ -193,6 +197,7 @@ def test_from_string(self): "is_array": False, "read_only": None, "custom_request_field": None, + "masking_strategy_override": None, } }, }, @@ -500,6 +505,7 @@ def _is_string_field(f: Field): return_all_elements=None, read_only=None, custom_request_field=None, + masking_strategy_override=None, ) array_field = generate_field( name="arr", @@ -514,6 +520,7 @@ def _is_string_field(f: Field): return_all_elements=True, read_only=None, custom_request_field=None, + masking_strategy_override=None, ) object_field = generate_field( name="obj", @@ -528,6 +535,7 @@ def _is_string_field(f: Field): return_all_elements=None, read_only=None, custom_request_field=None, + masking_strategy_override=None, ) object_array_field = generate_field( 
name="obj_a", @@ -542,6 +550,7 @@ def _is_string_field(f: Field): return_all_elements=None, read_only=None, custom_request_field=None, + masking_strategy_override=None, ) custom_request_field = generate_field( name="custom_field", @@ -556,6 +565,7 @@ def _is_string_field(f: Field): return_all_elements=None, read_only=None, custom_request_field="site_id", + masking_strategy_override=None, ) assert _is_string_field(string_field) @@ -672,6 +682,7 @@ def test_generate_object_field_with_data_categories(self): return_all_elements=None, read_only=False, custom_request_field=None, + masking_strategy_override=None, ) assert field @@ -689,6 +700,7 @@ def test_generate_read_only_scalar_field(self): return_all_elements=None, read_only=True, custom_request_field=None, + masking_strategy_override=None, ) assert isinstance(field, ScalarField) assert field.read_only diff --git a/tests/ops/service/connectors/test_queryconfig.py b/tests/ops/service/connectors/test_queryconfig.py index 52572288f8..d6e591d2d7 100644 --- a/tests/ops/service/connectors/test_queryconfig.py +++ b/tests/ops/service/connectors/test_queryconfig.py @@ -1,5 +1,6 @@ from datetime import datetime, timezone -from typing import Any, Dict, Set +from typing import Any, Dict, Generator, Set +from unittest import mock import pytest from boto3.dynamodb.types import TypeDeserializer @@ -24,11 +25,13 @@ from fides.api.service.connectors.query_config import ( DynamoDBQueryConfig, MongoQueryConfig, + QueryConfig, SQLQueryConfig, ) from fides.api.service.connectors.scylla_query_config import ScyllaDBQueryConfig from fides.api.service.masking.strategy.masking_strategy_hash import HashMaskingStrategy from fides.api.util.data_category import DataCategory +from tests.fixtures.application_fixtures import load_dataset from ...task.traversal_data import combined_mongo_postgresql_graph, integration_db_graph from ...test_helpers.cache_secrets_helper import cache_secret, clear_cache_secrets @@ -57,6 +60,53 @@ privacy_request = 
PrivacyRequest(id="234544") +@mock.patch.multiple(QueryConfig, __abstractmethods__=set()) +class TestQueryConfig: + + def test_update_value_map_masking_strategy_override( + self, erasure_policy_all_categories, connection_config + ): + example_dataset = load_dataset( + "data/dataset/example_field_masking_override_test_dataset.yml" + ) + dataset = Dataset(**example_dataset[0]) + graph = convert_dataset_to_graph(dataset, connection_config.key) + dataset_graph = DatasetGraph(*[graph]) + traversal = Traversal(dataset_graph, {"email": "customer-1@example.com"}) + + customer_node = traversal.traversal_node_dict[ + CollectionAddress("field_masking_override_test_dataset", "customer") + ].to_mock_execution_node() + + config = QueryConfig(customer_node) + row = { + "email": "customer-1@example.com", + "name": "John Customer", + "address_id": 1, + "id": 1, + "address": { + "city": "San Francisco", + "state": "CA", + "zip": "94105", + "house": "123", + "street": "Main St", + }, + } + updated_value_map = config.update_value_map( + row, erasure_policy_all_categories, privacy_request + ) + + for key, value in updated_value_map.items(): + # override the null rewrite masking strategy for the name field to use random_string_rewrite + if key == "name": + assert value.endswith("@example.com") + # override the null rewrite masking strategy for address.house field to use string_rewrite + elif key == "address.house": + assert value == "1234-test" + else: + assert value is None + + class TestSQLQueryConfig: def test_extract_query_components(self): def found_query_keys(node: ExecutionNode, values: Dict[str, Any]) -> Set[str]: diff --git a/tests/ops/service/privacy_request/test_request_runner_service.py b/tests/ops/service/privacy_request/test_request_runner_service.py index 66f332d417..9a72b8c3e6 100644 --- a/tests/ops/service/privacy_request/test_request_runner_service.py +++ b/tests/ops/service/privacy_request/test_request_runner_service.py @@ -1336,6 +1336,60 @@ def 
test_create_and_process_erasure_request_specific_category_postgres( assert customer_found +@pytest.mark.integration_postgres +@pytest.mark.integration +@pytest.mark.parametrize( + "dsr_version", + ["use_dsr_3_0", "use_dsr_2_0"], +) +def test_create_and_process_erasure_request_with_masking_strategy_override( + postgres_integration_db, + postgres_example_test_dataset_config, + cache, + db, + generate_auth_header, + erasure_policy, + dsr_version, + request, + read_connection_config, + run_privacy_request_task, +): + request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 + + employee_email = "employee-1@example.com" + employee_id = 1 + data = { + "requested_at": "2021-08-30T16:09:37.359Z", + "policy_key": erasure_policy.key, + "identity": {"email": employee_email}, + } + + stmt = select("*").select_from(table("employee")) + res = postgres_integration_db.execute(stmt).all() + + pr = get_privacy_request_results( + db, + erasure_policy, + run_privacy_request_task, + data, + ) + pr.delete(db=db) + + stmt = select( + column("id"), + column("name"), + ).select_from(table("employee")) + res = postgres_integration_db.execute(stmt).all() + + customer_found = False + for row in res: + if employee_id == row.id: + customer_found = True + # Check that the `name` field was masked with the override provided in the dataset + assert row.name == "testing-test" + assert customer_found + + @pytest.mark.integration_mssql @pytest.mark.integration @pytest.mark.parametrize( @@ -2058,7 +2112,7 @@ def test_create_and_process_erasure_request_bigquery( dsr_version, request, bigquery_fixtures, - erasure_policy, + biquery_erasure_policy, run_privacy_request_task, ): request.getfixturevalue(dsr_version) # REQUIRED to test both DSR 3.0 and 2.0 @@ -2077,14 +2131,14 @@ def test_create_and_process_erasure_request_bigquery( customer_email = bigquery_resources["email"] data = { "requested_at": "2021-08-30T16:09:37.359Z", - "policy_key": erasure_policy.key, + "policy_key": 
biquery_erasure_policy.key, "identity": {"email": customer_email}, } # Should erase customer name pr = get_privacy_request_results( db, - erasure_policy, + biquery_erasure_policy, run_privacy_request_task, data, task_timeout=PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL, @@ -2108,14 +2162,15 @@ def test_create_and_process_erasure_request_bigquery( assert row.city == bigquery_resources["city"] assert row.state == bigquery_resources["state"] - target = erasure_policy.rules[0].targets[0] - target.data_category = "user.contact.address.state" - target.save(db=db) + for target in biquery_erasure_policy.rules[0].targets: + if target.data_category == "user.name": + target.data_category = "user.contact.address.state" + target.save(db=db) # Should erase state fields on address table pr = get_privacy_request_results( db, - erasure_policy, + biquery_erasure_policy, run_privacy_request_task, data, task_timeout=PRIVACY_REQUEST_TASK_TIMEOUT_EXTERNAL, @@ -2124,19 +2179,14 @@ def test_create_and_process_erasure_request_bigquery( bigquery_client = bigquery_resources["client"] with bigquery_client.connect() as connection: address_id = bigquery_resources["address_id"] - stmt = f"select 'id', city, state from fidesopstest.address where id = {address_id};" - res = connection.execute(stmt).all() - for row in res: - # State field was targeted by erasure policy but city was not - assert row.city is not None - assert row.state is None - - stmt = f"select 'id', city, state from fidesopstest.address where id = {address_id};" + stmt = f"select 'id', city, state, street from fidesopstest.address where id = {address_id};" res = connection.execute(stmt).all() for row in res: # State field was targeted by erasure policy but city was not assert row.city is not None assert row.state is None + # Street field was targeted by erasure policy but overridden by field-level masking_strategy_override + assert row.street == "REDACTED" stmt = f"select * from fidesopstest.employee where address_id = 
{bigquery_resources['address_id']};" res = connection.execute(stmt).all() diff --git a/tests/ops/task/test_create_request_tasks.py b/tests/ops/task/test_create_request_tasks.py index f3cf0b75f6..1ba24431cb 100644 --- a/tests/ops/task/test_create_request_tasks.py +++ b/tests/ops/task/test_create_request_tasks.py @@ -54,6 +54,7 @@ "data_type_converter": "None", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "ccn", @@ -67,6 +68,7 @@ "data_type_converter": "None", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "code", @@ -80,6 +82,7 @@ "data_type_converter": "None", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "customer_id", @@ -93,6 +96,7 @@ "data_type_converter": "None", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "id", @@ -106,6 +110,7 @@ "data_type_converter": "None", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "name", @@ -119,6 +124,7 @@ "data_type_converter": "None", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "preferred", @@ -132,6 +138,7 @@ "data_type_converter": "None", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, ], "erase_after": [], @@ -305,6 +312,7 @@ def test_persist_access_tasks_with_object_fields_in_collection( "data_type_converter": "object_id", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "customer_identifiers", @@ -326,6 +334,7 @@ def test_persist_access_tasks_with_object_fields_in_collection( "data_type_converter": "string", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, "derived_phone": { "name": 
"derived_phone", @@ -339,6 +348,7 @@ def test_persist_access_tasks_with_object_fields_in_collection( "data_type_converter": "string", "return_all_elements": True, "custom_request_field": None, + "masking_strategy_override": None, }, "derived_emails": { "name": "derived_emails", @@ -352,6 +362,7 @@ def test_persist_access_tasks_with_object_fields_in_collection( "data_type_converter": "string", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, }, "length": None, @@ -364,6 +375,7 @@ def test_persist_access_tasks_with_object_fields_in_collection( "data_type_converter": "object", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "derived_interests", @@ -377,6 +389,7 @@ def test_persist_access_tasks_with_object_fields_in_collection( "data_type_converter": "string", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, ], "erase_after": [], @@ -913,6 +926,7 @@ def test_erase_after_saas_upstream_and_downstream_tasks( "data_type_converter": "integer", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, { "name": "email", @@ -926,6 +940,7 @@ def test_erase_after_saas_upstream_and_downstream_tasks( "data_type_converter": "None", "return_all_elements": None, "custom_request_field": None, + "masking_strategy_override": None, }, ] assert not serialized_collection["skip_processing"]