Skip to content

Commit

Permalink
Add metadata anonymization to public SDV (#2141)
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo authored Jul 25, 2024
1 parent a494d4a commit 9c5d3c9
Show file tree
Hide file tree
Showing 6 changed files with 259 additions and 0 deletions.
41 changes: 41 additions & 0 deletions sdv/metadata/multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,47 @@ def get_table_metadata(self, table_name):
self._validate_table_exists(table_name)
return deepcopy(self.tables[table_name])

def anonymize(self):
    """Anonymize metadata by obfuscating table and column names.

    Tables are renamed ``table1``, ``table2``, ... following the iteration
    order of ``self.tables``, and the dict form of each table's own
    ``anonymize`` result is stored under the new name. Relationships are then
    rewritten with the new table names and with the column names read from
    each original table's ``_anonymized_column_map``.

    Returns:
        MultiTableMetadata:
            An anonymized MultiTableMetadata instance.
    """
    table_name_map = {}
    anonymized_tables = {}
    for position, (table_name, table_metadata) in enumerate(self.tables.items(), start=1):
        new_table_name = f'table{position}'
        table_name_map[table_name] = new_table_name
        anonymized_tables[new_table_name] = table_metadata.anonymize().to_dict()

    anonymized_relationships = []
    for relationship in self.relationships:
        parent = relationship['parent_table_name']
        child = relationship['child_table_name']

        # The column maps are read from the ORIGINAL table objects; they are
        # expected to be in place after the ``anonymize`` calls made above.
        parent_columns = self.tables[parent]._anonymized_column_map
        child_columns = self.tables[child]._anonymized_column_map

        anonymized_relationships.append({
            'parent_table_name': table_name_map[parent],
            'child_table_name': table_name_map[child],
            'child_foreign_key': child_columns[relationship['child_foreign_key']],
            'parent_primary_key': parent_columns[relationship['parent_primary_key']],
        })

    return MultiTableMetadata.load_from_dict({
        'tables': anonymized_tables,
        'relationships': anonymized_relationships,
    })

def visualize(
self, show_table_details='full', show_relationship_labels=True, output_filepath=None
):
Expand Down
35 changes: 35 additions & 0 deletions sdv/metadata/single_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1218,6 +1218,41 @@ def validate_data(self, data, sdtype_warnings=None):
if errors:
raise InvalidDataError(errors)

def anonymize(self):
    """Anonymize metadata by obfuscating column names.

    Columns are renamed ``col1``, ``col2``, ... following the iteration order
    of ``self.columns``. The mapping from original to anonymized names is
    stored on this instance as ``_anonymized_column_map``. The primary key,
    alternate keys, sequence key and sequence index are carried over under
    their new names when they are set; no other metadata fields are copied
    by this method.

    Returns:
        SingleTableMetadata:
            An anonymized SingleTableMetadata instance.
    """
    column_map = {
        column: f'col{position}' for position, column in enumerate(self.columns, start=1)
    }
    self._anonymized_column_map = column_map

    # Column metadata values are passed through as-is (not copied here).
    anonymized_metadata = {
        'columns': {
            column_map[column]: column_metadata
            for column, column_metadata in self.columns.items()
        }
    }

    if self.primary_key:
        anonymized_metadata['primary_key'] = column_map[self.primary_key]

    if self.alternate_keys:
        anonymized_metadata['alternate_keys'] = [
            column_map[alternate_key] for alternate_key in self.alternate_keys
        ]

    if self.sequence_key:
        anonymized_metadata['sequence_key'] = column_map[self.sequence_key]

    if self.sequence_index:
        anonymized_metadata['sequence_index'] = column_map[self.sequence_index]

    return SingleTableMetadata.load_from_dict(anonymized_metadata)

def visualize(self, show_table_details='full', output_filepath=None):
"""Create a visualization of the single-table dataset.
Expand Down
57 changes: 57 additions & 0 deletions tests/integration/metadata/test_multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,3 +375,60 @@ def test_get_table_metadata():
'column_relationships': [{'type': 'gps', 'column_names': ['latitude', 'longitude']}],
}
assert table_metadata.to_dict() == expected_metadata


def test_anonymize():
    """Test that ``anonymize`` renames tables, columns and relationship keys."""
    # Setup
    tables = {
        'real_table1': {
            'columns': {
                'table1_primary_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
                'table1_column2': {'sdtype': 'categorical'},
            },
            'primary_key': 'table1_primary_key',
        },
        'real_table2': {
            'columns': {
                'table2_primary_key': {'sdtype': 'email'},
                'table2_foreign_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
            },
            'primary_key': 'table2_primary_key',
        },
    }
    relationships = [
        {
            'parent_table_name': 'real_table1',
            'parent_primary_key': 'table1_primary_key',
            'child_table_name': 'real_table2',
            'child_foreign_key': 'table2_foreign_key',
        }
    ]
    metadata = MultiTableMetadata.load_from_dict({
        'tables': tables,
        'relationships': relationships,
    })
    # Keyed by the name each table is expected to receive once anonymized.
    original_tables = {
        'table1': metadata.tables['real_table1'],
        'table2': metadata.tables['real_table2'],
    }
    metadata.validate()

    # Run
    anonymized = metadata.anonymize()

    # Assert
    anonymized.validate()

    assert anonymized.tables.keys() == {'table1', 'table2'}
    assert len(anonymized.relationships) == len(metadata.relationships)

    expected_relationship = {
        'parent_table_name': 'table1',
        'child_table_name': 'table2',
        'parent_primary_key': 'col1',
        'child_foreign_key': 'col2',
    }
    anonymized_relationship = anonymized.relationships[0]
    for key, expected_value in expected_relationship.items():
        assert anonymized_relationship[key] == expected_value

    # The column metadata must be unchanged; only the column name differs.
    expected_columns = [
        ('table1', 'col1', 'table1_primary_key'),
        ('table2', 'col2', 'table2_foreign_key'),
    ]
    for table_name, anonymized_column, original_column in expected_columns:
        anonymized_column_metadata = anonymized.tables[table_name].columns[anonymized_column]
        assert anonymized_column_metadata == original_tables[table_name].columns[original_column]

    for table_name, original_table in original_tables.items():
        assert anonymized.tables[table_name].to_dict() == original_table.anonymize().to_dict()
46 changes: 46 additions & 0 deletions tests/integration/metadata/test_single_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,3 +551,49 @@ def test_metadata_set_same_sequence_primary():
)
with pytest.raises(InvalidMetadataError, match=error_msg_sequence):
metadata_primary.set_sequence_key('A')


def test_anonymize():
    """Test that ``anonymize`` renames columns and remaps every key field."""
    # Setup
    columns = {
        'primary_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
        'sequence_index': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
        'sequence_key': {'sdtype': 'id'},
        'alternate_id1': {'sdtype': 'email', 'pii': True},
        'alternate_id2': {'sdtype': 'name', 'pii': True},
        'numerical': {'sdtype': 'numerical', 'computer_representation': 'Float'},
        'categorical': {'sdtype': 'categorical'},
    }
    metadata = SingleTableMetadata.load_from_dict({
        'columns': columns,
        'primary_key': 'primary_key',
        'sequence_index': 'sequence_index',
        'sequence_key': 'sequence_key',
        'alternate_keys': ['alternate_id1', 'alternate_id2'],
    })
    metadata.validate()

    # Run
    anonymized = metadata.anonymize()

    # Assert
    anonymized.validate()

    # None of the original column names may survive in the anonymized metadata.
    assert set(metadata.columns).isdisjoint(anonymized.columns)
    for original_col, anonymized_col in metadata._anonymized_column_map.items():
        assert metadata.columns[original_col] == anonymized.columns[anonymized_col]

    assert anonymized.columns[anonymized.primary_key] == metadata.columns['primary_key']

    assert len(anonymized.alternate_keys) == len(metadata.alternate_keys)
    for position, original_key in enumerate(['alternate_id1', 'alternate_id2']):
        anonymized_key = anonymized.alternate_keys[position]
        assert anonymized.columns[anonymized_key] == metadata.columns[original_key]

    assert anonymized.columns[anonymized.sequence_index] == metadata.columns['sequence_index']
    assert anonymized.columns[anonymized.sequence_key] == metadata.columns['sequence_key']
41 changes: 41 additions & 0 deletions tests/unit/metadata/test_multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3079,3 +3079,44 @@ def test_upgrade_metadata_validate_error(
'Successfully converted the old metadata, but the metadata was not valid.'
'To use this with the SDV, please fix the following errors.\n blah'
)

@patch('sdv.metadata.multi_table.SingleTableMetadata.load_from_dict')
def test_anonymize(self, mock_load):
    """Test that ``anonymize`` delegates to each table and remaps relationships."""
    # Setup
    mock_load.return_value = Mock()

    parent_table = Mock()
    parent_table._anonymized_column_map = {'table1_primary_key': 'col1'}

    child_table = Mock()
    child_table._anonymized_column_map = {
        'table2_primary_key': 'col1',
        'table2_foreign_key': 'col2',
    }

    instance = MultiTableMetadata()
    instance.tables = {
        'real_table1': parent_table,
        'real_table2': child_table,
    }
    instance.relationships = [
        {
            'parent_table_name': 'real_table1',
            'child_table_name': 'real_table2',
            'parent_primary_key': 'table1_primary_key',
            'child_foreign_key': 'table2_foreign_key',
        }
    ]

    # Run
    anonymized = instance.anonymize()

    # Assert
    for table in (parent_table, child_table):
        table.anonymize.assert_called_once_with()

    assert anonymized.tables.keys() == {'table1', 'table2'}

    expected_relationship = {
        'parent_table_name': 'table1',
        'child_table_name': 'table2',
        'parent_primary_key': 'col1',
        'child_foreign_key': 'col2',
    }
    assert anonymized.relationships[0] == expected_relationship
39 changes: 39 additions & 0 deletions tests/unit/metadata/test_single_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3210,3 +3210,42 @@ def test_upgrade_metadata_validate_error(
'Successfully converted the old metadata, but the metadata was not valid. '
'To use this with the SDV, please fix the following errors.\n blah'
)

def test_anonymize(self):
    """Test that ``anonymize`` renames columns and remaps the key fields."""
    # Setup
    instance = SingleTableMetadata()
    instance.columns = {
        'real_column1': {'sdtype': 'id', 'regex_format': r'\d{30}'},
        'real_column2': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
        'real_column3': {'sdtype': 'numerical'},
        'real_column4': {'sdtype': 'id'},
    }
    instance.primary_key = 'real_column1'
    instance.alternate_keys = ['real_column4']
    instance.sequence_index = 'real_column2'
    instance.sequence_key = 'real_column4'

    # Run
    anonymized = instance.anonymize()

    # Assert
    anonymized.validate()

    # None of the original column names may survive in the anonymized metadata.
    assert set(instance.columns).isdisjoint(anonymized.columns)
    for original_col, anonymized_col in instance._anonymized_column_map.items():
        assert instance.columns[original_col] == anonymized.columns[anonymized_col]

    first_alternate_key = anonymized.alternate_keys[0]
    expected_sources = [
        (anonymized.primary_key, 'real_column1'),
        (first_alternate_key, 'real_column4'),
        (anonymized.sequence_index, 'real_column2'),
        (anonymized.sequence_key, 'real_column4'),
    ]
    for anonymized_col, original_col in expected_sources:
        assert anonymized.columns[anonymized_col] == instance.columns[original_col]

    # ``real_column4`` is both alternate key and sequence key, so both must
    # resolve to the same anonymized column name.
    assert first_alternate_key == anonymized.sequence_key

0 comments on commit 9c5d3c9

Please sign in to comment.