Skip to content

Commit

Permalink
Metadata anonymize doesn't produce the right METADATA_SPEC_VERSION
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo authored Jan 10, 2025
1 parent 10d40fc commit 7384de0
Show file tree
Hide file tree
Showing 6 changed files with 177 additions and 7 deletions.
11 changes: 11 additions & 0 deletions sdv/metadata/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,3 +288,14 @@ def get_table_metadata(self, table_name=None):
table_name = self._handle_table_name(table_name)
table_metadata = super().get_table_metadata(table_name)
return Metadata.load_from_dict(table_metadata.to_dict(), single_table_name=table_name)

def anonymize(self):
    """Anonymize metadata by obfuscating table and column names.

    Returns:
        Metadata:
            An anonymized Metadata instance.
    """
    anonymized_metadata = self._get_anonymized_dict()

    # Load through ``Metadata`` (not ``MultiTableMetadata``) so the result is a
    # ``Metadata`` object and reports this class's METADATA_SPEC_VERSION.
    return Metadata.load_from_dict(anonymized_metadata)
19 changes: 12 additions & 7 deletions sdv/metadata/multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,13 +914,7 @@ def get_table_metadata(self, table_name):
self._validate_table_exists(table_name)
return deepcopy(self.tables[table_name])

def anonymize(self):
"""Anonymize metadata by obfuscating column names.
Returns:
MultiTableMetadata:
An anonymized MultiTableMetadata instance.
"""
def _get_anonymized_dict(self):
anonymized_metadata = {'tables': {}, 'relationships': []}
anonymized_table_map = {}
counter = 1
Expand Down Expand Up @@ -953,6 +947,17 @@ def anonymize(self):
'parent_primary_key': anonymized_primary_key,
})

return anonymized_metadata

def anonymize(self):
    """Return an anonymized copy of this metadata.

    The anonymized dictionary representation is produced by
    ``_get_anonymized_dict`` and then loaded into a new instance.

    Returns:
        MultiTableMetadata:
            An anonymized MultiTableMetadata instance.
    """
    return MultiTableMetadata.load_from_dict(self._get_anonymized_dict())

def visualize(
Expand Down
58 changes: 58 additions & 0 deletions tests/integration/metadata/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,3 +462,61 @@ def test_any_metadata_update_multi_table(method, args, kwargs):
assert expected_dict != metadata_before
else:
assert result == ['checkin_date', 'checkout_date']


def test_anonymize():
    """Check that ``Metadata.anonymize`` renames tables/columns and keeps column details."""
    # Setup
    original_dict = {
        'tables': {
            'real_table1': {
                'columns': {
                    'table1_primary_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
                    'table1_column2': {'sdtype': 'categorical'},
                },
                'primary_key': 'table1_primary_key',
            },
            'real_table2': {
                'columns': {
                    'table2_primary_key': {'sdtype': 'email'},
                    'table2_foreign_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
                },
                'primary_key': 'table2_primary_key',
            },
        },
        'relationships': [
            {
                'parent_table_name': 'real_table1',
                'parent_primary_key': 'table1_primary_key',
                'child_table_name': 'real_table2',
                'child_foreign_key': 'table2_foreign_key',
            }
        ],
    }
    metadata = Metadata.load_from_dict(original_dict)
    real_table1 = metadata.tables['real_table1']
    real_table2 = metadata.tables['real_table2']
    metadata.validate()

    # Run
    anonymized = metadata.anonymize()

    # Assert
    anonymized.validate()

    # The result must stay a ``Metadata`` object, identified by its spec version.
    assert anonymized.METADATA_SPEC_VERSION == 'V1'
    assert anonymized.tables.keys() == {'table1', 'table2'}
    assert len(anonymized.relationships) == len(metadata.relationships)

    # Every relationship field must point at the anonymized names.
    expected_relationship = {
        'parent_table_name': 'table1',
        'child_table_name': 'table2',
        'parent_primary_key': 'col1',
        'child_foreign_key': 'col2',
    }
    for field, expected_value in expected_relationship.items():
        assert anonymized.relationships[0][field] == expected_value

    # Column definitions are carried over unchanged under their new names.
    anonymized_primary_key = anonymized.tables['table1'].columns['col1']
    assert anonymized_primary_key == real_table1.columns['table1_primary_key']

    anonymized_foreign_key = anonymized.tables['table2'].columns['col2']
    assert anonymized_foreign_key == real_table2.columns['table2_foreign_key']

    # Each anonymized table matches what the table-level ``anonymize`` produces.
    for anonymized_name, real_table in (('table1', real_table1), ('table2', real_table2)):
        assert anonymized.tables[anonymized_name].to_dict() == real_table.anonymize().to_dict()
1 change: 1 addition & 0 deletions tests/integration/metadata/test_multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ def test_anonymize():
# Assert
anonymized.validate()

assert anonymized.METADATA_SPEC_VERSION == 'MULTI_TABLE_V1'
assert anonymized.tables.keys() == {'table1', 'table2'}
assert len(anonymized.relationships) == len(metadata.relationships)
assert anonymized.relationships[0]['parent_table_name'] == 'table1'
Expand Down
15 changes: 15 additions & 0 deletions tests/unit/metadata/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,3 +717,18 @@ def test__handle_table_name_with_empty_tables(self):
error_msg = 'Metadata does not contain any tables. No columns can be added.'
with pytest.raises(ValueError, match=error_msg):
instance._handle_table_name(None)

@patch('sdv.metadata.metadata.Metadata.load_from_dict')
def test_anonymize(self, mock_load_from_dict):
    """Test that ``anonymize`` builds its result with ``Metadata.load_from_dict``."""
    # Setup
    metadata = Metadata()
    metadata._get_anonymized_dict = Mock(return_value={})

    # Run
    anonymized = metadata.anonymize()

    # Assert
    metadata._get_anonymized_dict.assert_called_once_with()
    mock_load_from_dict.assert_called_once_with({})
    # ``anonymize`` must hand back exactly what ``load_from_dict`` produced.
    assert anonymized is mock_load_from_dict.return_value
80 changes: 80 additions & 0 deletions tests/unit/metadata/test_multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3160,6 +3160,86 @@ def test_anonymize(self, mock_load):
'child_foreign_key': 'col2',
}

def test__get_anonymized_dict(self):
    """Check that ``_get_anonymized_dict`` returns a dict with generic table and column names."""
    # Setup
    original_dict = {
        'tables': {
            'real_table1': {
                'columns': {
                    'table1_primary_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
                    'table1_column2': {'sdtype': 'categorical'},
                },
                'primary_key': 'table1_primary_key',
            },
            'real_table2': {
                'columns': {
                    'table2_primary_key': {'sdtype': 'email'},
                    'table2_foreign_key': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
                },
                'primary_key': 'table2_primary_key',
            },
        },
        'relationships': [
            {
                'parent_table_name': 'real_table1',
                'parent_primary_key': 'table1_primary_key',
                'child_table_name': 'real_table2',
                'child_foreign_key': 'table2_foreign_key',
            }
        ],
    }
    # Tables become ``table<N>`` and columns ``col<N>``; column details are kept as-is.
    expected = {
        'tables': {
            'table1': {
                'columns': {
                    'col1': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
                    'col2': {'sdtype': 'categorical'},
                },
                'primary_key': 'col1',
                'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
            },
            'table2': {
                'columns': {
                    'col1': {'sdtype': 'email'},
                    'col2': {'sdtype': 'id', 'regex_format': 'ID_[0-9]{3}'},
                },
                'primary_key': 'col1',
                'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
            },
        },
        'relationships': [
            {
                'parent_table_name': 'table1',
                'child_table_name': 'table2',
                'child_foreign_key': 'col2',
                'parent_primary_key': 'col1',
            }
        ],
    }
    instance = MultiTableMetadata.load_from_dict(original_dict)

    # Run
    result = instance._get_anonymized_dict()

    # Assert
    assert result == expected

# Patch the classmethod on the module that defines ``MultiTableMetadata``.
@patch('sdv.metadata.multi_table.MultiTableMetadata.load_from_dict')
def test_anonymize_mock(self, mock_load_from_dict):
    """Test that ``anonymize`` builds its result with ``MultiTableMetadata.load_from_dict``."""
    # Setup
    metadata = MultiTableMetadata()
    metadata._get_anonymized_dict = Mock(return_value={})

    # Run
    anonymized = metadata.anonymize()

    # Assert
    metadata._get_anonymized_dict.assert_called_once_with()
    mock_load_from_dict.assert_called_once_with({})
    # ``anonymize`` must hand back exactly what ``load_from_dict`` produced.
    assert anonymized is mock_load_from_dict.return_value

def test_update_columns_no_list_error(self):
"""Test that ``update_columns`` only takes in list and that an error is thrown."""
# Setup
Expand Down

0 comments on commit 7384de0

Please sign in to comment.