# sdv/metadata/multi_table.py -- method of MultiTableMetadata
def anonymize(self):
    """Anonymize metadata by obfuscating table and column names.

    Tables are renamed to ``table<N>``, where ``N`` is the 1-based position of the
    table in ``self.tables``. Each table's own ``anonymize`` method produces its
    anonymized metadata, and every relationship is rewritten to use the new table
    and key names.

    Returns:
        MultiTableMetadata:
            An anonymized MultiTableMetadata instance.
    """
    table_name_map = {}
    anonymized_tables = {}
    for position, (table_name, table_metadata) in enumerate(self.tables.items(), start=1):
        new_table_name = f'table{position}'
        table_name_map[table_name] = new_table_name
        anonymized_tables[new_table_name] = table_metadata.anonymize().to_dict()

    anonymized_relationships = []
    for relationship in self.relationships:
        parent_table = relationship['parent_table_name']
        new_parent_table = table_name_map[parent_table]

        child_table = relationship['child_table_name']
        new_child_table = table_name_map[child_table]

        # The per-table column maps are read from the original table objects; the
        # loop above has already called ``anonymize`` on every one of them.
        child_column_map = self.tables[child_table]._anonymized_column_map
        new_foreign_key = child_column_map[relationship['child_foreign_key']]

        parent_column_map = self.tables[parent_table]._anonymized_column_map
        new_primary_key = parent_column_map[relationship['parent_primary_key']]

        anonymized_relationships.append({
            'parent_table_name': new_parent_table,
            'child_table_name': new_child_table,
            'child_foreign_key': new_foreign_key,
            'parent_primary_key': new_primary_key,
        })

    return MultiTableMetadata.load_from_dict({
        'tables': anonymized_tables,
        'relationships': anonymized_relationships,
    })
# sdv/metadata/single_table.py -- method of SingleTableMetadata
def anonymize(self):
    """Anonymize metadata by obfuscating column names.

    Every column is renamed to ``col<N>``, where ``N`` is the 1-based position of
    the column in ``self.columns``. The per-column metadata is carried over
    unchanged, and the primary key, alternate keys, sequence key, sequence index
    and column relationships are rewritten to use the new names.

    The original-to-anonymized name mapping is stored on this instance as
    ``_anonymized_column_map``.

    Returns:
        SingleTableMetadata:
            An anonymized SingleTableMetadata instance.
    """
    # Kept on the instance on purpose: ``MultiTableMetadata.anonymize`` reads this
    # map to translate the keys used in its relationships.
    self._anonymized_column_map = {
        column: f'col{position}' for position, column in enumerate(self.columns, start=1)
    }
    column_map = self._anonymized_column_map

    anonymized_metadata = {
        'columns': {
            column_map[column]: column_metadata
            for column, column_metadata in self.columns.items()
        }
    }

    if self.primary_key:
        anonymized_metadata['primary_key'] = column_map[self.primary_key]

    if self.alternate_keys:
        anonymized_metadata['alternate_keys'] = [
            column_map[alternate_key] for alternate_key in self.alternate_keys
        ]

    if self.sequence_key:
        anonymized_metadata['sequence_key'] = column_map[self.sequence_key]

    if self.sequence_index:
        anonymized_metadata['sequence_index'] = column_map[self.sequence_index]

    # Column relationships (e.g. ``{'type': 'gps', 'column_names': [...]}``) name
    # columns too; without translating them they would be silently dropped from
    # the anonymized metadata.
    # NOTE(review): the attribute name is assumed to mirror the
    # 'column_relationships' key emitted by ``to_dict`` -- confirm.
    column_relationships = getattr(self, 'column_relationships', None)
    if column_relationships:
        anonymized_metadata['column_relationships'] = [
            {
                **relationship,
                'column_names': [column_map[name] for name in relationship['column_names']],
            }
            for relationship in column_relationships
        ]

    return SingleTableMetadata.load_from_dict(anonymized_metadata)
# NOTE: the four tests below are added to four different test files by the diff;
# each one is preceded by the path of the file it belongs to.

# --- tests/integration/metadata/test_multi_table.py ---
def test_anonymize():
    """Test the ``anonymize`` method."""
    # Setup
    parent_table = {
        'columns': {
            'table1_primary_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
            'table1_column2': {'sdtype': 'categorical'},
        },
        'primary_key': 'table1_primary_key',
    }
    child_table = {
        'columns': {
            'table2_primary_key': {'sdtype': 'email'},
            'table2_foreign_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
        },
        'primary_key': 'table2_primary_key',
    }
    link = {
        'parent_table_name': 'real_table1',
        'parent_primary_key': 'table1_primary_key',
        'child_table_name': 'real_table2',
        'child_foreign_key': 'table2_foreign_key',
    }
    metadata = MultiTableMetadata.load_from_dict({
        'tables': {'real_table1': parent_table, 'real_table2': child_table},
        'relationships': [link],
    })
    table1_metadata = metadata.tables['real_table1']
    table2_metadata = metadata.tables['real_table2']
    metadata.validate()

    # Run
    anonymized = metadata.anonymize()

    # Assert
    anonymized.validate()

    assert anonymized.tables.keys() == {'table1', 'table2'}
    assert len(anonymized.relationships) == len(metadata.relationships)

    anon_relationship = anonymized.relationships[0]
    assert anon_relationship['parent_table_name'] == 'table1'
    assert anon_relationship['child_table_name'] == 'table2'
    assert anon_relationship['parent_primary_key'] == 'col1'
    assert anon_relationship['child_foreign_key'] == 'col2'

    anon_table1 = anonymized.tables['table1']
    anon_table2 = anonymized.tables['table2']
    assert anon_table1.columns['col1'] == table1_metadata.columns['table1_primary_key']
    assert anon_table2.columns['col2'] == table2_metadata.columns['table2_foreign_key']

    assert anon_table1.to_dict() == table1_metadata.anonymize().to_dict()
    assert anon_table2.to_dict() == table2_metadata.anonymize().to_dict()


# --- tests/integration/metadata/test_single_table.py ---
def test_anonymize():
    """Test the ``anonymize`` method."""
    # Setup
    columns = {
        'primary_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
        'sequence_index': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
        'sequence_key': {'sdtype': 'id'},
        'alternate_id1': {'sdtype': 'email', 'pii': True},
        'alternate_id2': {'sdtype': 'name', 'pii': True},
        'numerical': {'sdtype': 'numerical', 'computer_representation': 'Float'},
        'categorical': {'sdtype': 'categorical'},
    }
    metadata = SingleTableMetadata.load_from_dict({
        'columns': columns,
        'primary_key': 'primary_key',
        'sequence_index': 'sequence_index',
        'sequence_key': 'sequence_key',
        'alternate_keys': ['alternate_id1', 'alternate_id2'],
    })
    metadata.validate()

    # Run
    anonymized = metadata.anonymize()

    # Assert
    anonymized.validate()

    # None of the real column names may survive, and each renamed column must
    # carry the metadata of the column it replaces.
    assert all(original_col not in anonymized.columns for original_col in metadata.columns)
    for original_col, anonymized_col in metadata._anonymized_column_map.items():
        assert metadata.columns[original_col] == anonymized.columns[anonymized_col]

    assert anonymized.columns[anonymized.primary_key] == metadata.columns['primary_key']

    first_alternate, second_alternate = anonymized.alternate_keys[0:2]
    assert len(anonymized.alternate_keys) == len(metadata.alternate_keys)
    assert anonymized.columns[first_alternate] == metadata.columns['alternate_id1']
    assert anonymized.columns[second_alternate] == metadata.columns['alternate_id2']

    assert anonymized.columns[anonymized.sequence_index] == metadata.columns['sequence_index']
    assert anonymized.columns[anonymized.sequence_key] == metadata.columns['sequence_key']


# --- tests/unit/metadata/test_multi_table.py (method of a test class not shown in the diff) ---
@patch('sdv.metadata.multi_table.SingleTableMetadata.load_from_dict')
def test_anonymize(self, mock_load):
    """Test ``anonymize`` method."""
    # Setup
    mock_load.return_value = Mock()

    table1 = Mock()
    table1._anonymized_column_map = {'table1_primary_key': 'col1'}

    table2 = Mock()
    table2._anonymized_column_map = {
        'table2_primary_key': 'col1',
        'table2_foreign_key': 'col2',
    }

    instance = MultiTableMetadata()
    instance.tables = {'real_table1': table1, 'real_table2': table2}
    instance.relationships = [
        {
            'parent_table_name': 'real_table1',
            'child_table_name': 'real_table2',
            'parent_primary_key': 'table1_primary_key',
            'child_foreign_key': 'table2_foreign_key',
        }
    ]

    # Run
    anonymized = instance.anonymize()

    # Assert
    for table in (table1, table2):
        table.anonymize.assert_called_once_with()

    expected_relationship = {
        'parent_table_name': 'table1',
        'child_table_name': 'table2',
        'parent_primary_key': 'col1',
        'child_foreign_key': 'col2',
    }
    assert anonymized.tables.keys() == {'table1', 'table2'}
    assert anonymized.relationships[0] == expected_relationship


# --- tests/unit/metadata/test_single_table.py (method of a test class not shown in the diff) ---
def test_anonymize(self):
    """Test the ``anonymize`` method."""
    # Setup
    instance = SingleTableMetadata()
    instance.columns = {
        'real_column1': {'sdtype': 'id', 'regex_format': r'\d{30}'},
        'real_column2': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
        'real_column3': {'sdtype': 'numerical'},
        'real_column4': {'sdtype': 'id'},
    }
    instance.primary_key = 'real_column1'
    instance.alternate_keys = ['real_column4']
    instance.sequence_index = 'real_column2'
    instance.sequence_key = 'real_column4'

    # Run
    anonymized = instance.anonymize()

    # Assert
    anonymized.validate()

    assert all(original_col not in anonymized.columns for original_col in instance.columns)
    for original_col, anonymized_col in instance._anonymized_column_map.items():
        assert instance.columns[original_col] == anonymized.columns[anonymized_col]

    assert anonymized.columns[anonymized.primary_key] == instance.columns['real_column1']

    first_alternate = anonymized.alternate_keys[0]
    assert anonymized.columns[first_alternate] == instance.columns['real_column4']

    assert anonymized.columns[anonymized.sequence_index] == instance.columns['real_column2']

    anon_sequence_key = anonymized.sequence_key
    assert anonymized.columns[anon_sequence_key] == instance.columns['real_column4']

    # 'real_column4' is both the alternate key and the sequence key, so both must
    # map to the same anonymized name.
    assert first_alternate == anon_sequence_key