From 618f0031f46520fe7659302a0cef3a53f86b9c56 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 5 Aug 2024 15:00:55 +0200 Subject: [PATCH] move unit tests to integration tests --- tests/integration/multi_table/test_hma.py | 110 ++++++++++++++++++++++ tests/unit/multi_table/test_hma.py | 108 --------------------- 2 files changed, 110 insertions(+), 108 deletions(-) diff --git a/tests/integration/multi_table/test_hma.py b/tests/integration/multi_table/test_hma.py index e6fa27e2e..a774f51c4 100644 --- a/tests/integration/multi_table/test_hma.py +++ b/tests/integration/multi_table/test_hma.py @@ -4,6 +4,7 @@ import math import re import warnings +from unittest.mock import Mock import faker import numpy as np @@ -2019,3 +2020,112 @@ def test_fit_int_primary_key_regex_includes_zero(regex): ) with pytest.raises(SynthesizerInputError, match=message): instance.fit(data) + + +def test__estimate_num_columns_to_be_modeled_various_sdtypes(): + """Test the estimated number of columns is correct for various sdtypes. + + To check that the number columns is correct we Mock the ``_finalize`` method + and compare its output with the estimated number of columns. + + The dataset used follows the structure below: + R1 R2 + | / + GP + | + P + """ + # Setup + root1 = pd.DataFrame({'R1': [0, 1, 2]}) + root2 = pd.DataFrame({'R2': [0, 1, 2], 'data': [0, 1, 2]}) + grandparent = pd.DataFrame({'GP': [0, 1, 2], 'R1': [0, 1, 2], 'R2': [0, 1, 2]}) + parent = pd.DataFrame({ + 'P': [0, 1, 2], + 'GP': [0, 1, 2], + 'numerical': [0.1, 0.5, np.nan], + 'categorical': ['a', np.nan, 'c'], + 'datetime': [None, '2019-01-02', '2019-01-03'], + 'boolean': [float('nan'), False, True], + 'id': [0, 1, 2], + }) + data = { + 'root1': root1, + 'root2': root2, + 'grandparent': grandparent, + 'parent': parent, + } + metadata = MultiTableMetadata.load_from_dict({ + 'tables': { + 'root1': { + 'primary_key': 'R1', + 'columns': { + 'R1': {'sdtype': 'id'}, + }, + }, + 'root2': { + 'primary_key': 'R2', + 'columns': {'R2': {'sdtype': 'id'}, 'data': {'sdtype': 'numerical'}}, + }, + 'grandparent': { + 'primary_key': 'GP', + 'columns': { + 'GP': {'sdtype': 'id'}, + 'R1': {'sdtype': 'id'}, + 'R2': {'sdtype': 'id'}, + }, + }, + 'parent': { + 'primary_key': 'P', + 'columns': { + 'P': {'sdtype': 'id'}, + 'GP': {'sdtype': 'id'}, + 'numerical': {'sdtype': 'numerical'}, + 'categorical': {'sdtype': 'categorical'}, + 'datetime': {'sdtype': 'datetime'}, + 'boolean': {'sdtype': 'boolean'}, + 'id': {'sdtype': 'id'}, + }, + }, + }, + 'relationships': [ + { + 'parent_table_name': 'root1', + 'parent_primary_key': 'R1', + 'child_table_name': 'grandparent', + 'child_foreign_key': 'R1', + }, + { + 'parent_table_name': 'root2', + 'parent_primary_key': 'R2', + 'child_table_name': 'grandparent', + 'child_foreign_key': 'R2', + }, + { + 'parent_table_name': 'grandparent', + 'parent_primary_key': 'GP', + 'child_table_name': 'parent', + 'child_foreign_key': 'GP', + }, + ], + }) + synthesizer = HMASynthesizer(metadata) + synthesizer._finalize = Mock(return_value=data) + + # Run estimation + estimated_num_columns = synthesizer._estimate_num_columns(metadata) + + # Run actual modeling + synthesizer.fit(data) + synthesizer.sample() + + # Assert estimated number of columns is correct + tables = synthesizer._finalize.call_args[0][0] + for table_name, table in tables.items(): + # Subract all the id columns present in the data, as those are not estimated + num_table_cols = len(table.columns) + if table_name in {'parent', 'grandparent'}: + num_table_cols -= 3 + if table_name in {'root1', 'root2'}: + num_table_cols -= 1 + + assert num_table_cols == estimated_num_columns[table_name] diff --git a/tests/unit/multi_table/test_hma.py b/tests/unit/multi_table/test_hma.py index 51d5aa1a4..0db06ea82 100644 --- a/tests/unit/multi_table/test_hma.py +++ b/tests/unit/multi_table/test_hma.py @@ -1119,111 +1119,3 @@ def test__estimate_num_columns_to_be_modeled(self): num_table_cols -= 1 assert num_table_cols == estimated_num_columns[table_name] - - def test__estimate_num_columns_to_be_modeled_various_sdtypes(self): - """Test the estimated number of columns is correct for various sdtypes. - - To check that the number columns is correct we Mock the ``_finalize`` method - and compare its output with the estimated number of columns. - - The dataset used follows the structure below: - R1 R2 - | / - GP - | - P - """ - # Setup - root1 = pd.DataFrame({'R1': [0, 1, 2]}) - root2 = pd.DataFrame({'R2': [0, 1, 2], 'data': [0, 1, 2]}) - grandparent = pd.DataFrame({'GP': [0, 1, 2], 'R1': [0, 1, 2], 'R2': [0, 1, 2]}) - parent = pd.DataFrame({ - 'P': [0, 1, 2], - 'GP': [0, 1, 2], - 'numerical': [0.1, 0.5, np.nan], - 'categorical': ['a', np.nan, 'c'], - 'datetime': [None, '2019-01-02', '2019-01-03'], - 'boolean': [float('nan'), False, True], - 'id': [0, 1, 2], - }) - data = { - 'root1': root1, - 'root2': root2, - 'grandparent': grandparent, - 'parent': parent, - } - metadata = MultiTableMetadata.load_from_dict({ - 'tables': { - 'root1': { - 'primary_key': 'R1', - 'columns': { - 'R1': {'sdtype': 'id'}, - }, - }, - 'root2': { - 'primary_key': 'R2', - 'columns': {'R2': {'sdtype': 'id'}, 'data': {'sdtype': 'numerical'}}, - }, - 'grandparent': { - 'primary_key': 'GP', - 'columns': { - 'GP': {'sdtype': 'id'}, - 'R1': {'sdtype': 'id'}, - 'R2': {'sdtype': 'id'}, - }, - }, - 'parent': { - 'primary_key': 'P', - 'columns': { - 'P': {'sdtype': 'id'}, - 'GP': {'sdtype': 'id'}, - 'numerical': {'sdtype': 'numerical'}, - 'categorical': {'sdtype': 'categorical'}, - 'datetime': {'sdtype': 'datetime'}, - 'boolean': {'sdtype': 'boolean'}, - 'id': {'sdtype': 'id'}, - }, - }, - }, - 'relationships': [ - { - 'parent_table_name': 'root1', - 'parent_primary_key': 'R1', - 'child_table_name': 'grandparent', - 'child_foreign_key': 'R1', - }, - { - 'parent_table_name': 'root2', - 'parent_primary_key': 'R2', - 'child_table_name': 'grandparent', - 'child_foreign_key': 'R2', - }, - { - 'parent_table_name': 'grandparent', - 'parent_primary_key': 'GP', - 'child_table_name': 'parent', - 'child_foreign_key': 'GP', - }, - ], - }) - synthesizer = HMASynthesizer(metadata) - synthesizer._finalize = Mock(return_value=data) - - # Run estimation - estimated_num_columns = synthesizer._estimate_num_columns(metadata) - - # Run actual modeling - synthesizer.fit(data) - synthesizer.sample() - - # Assert estimated number of columns is correct - tables = synthesizer._finalize.call_args[0][0] - for table_name, table in tables.items(): - # Subract all the id columns present in the data, as those are not estimated - num_table_cols = len(table.columns) - if table_name in {'parent', 'grandparent'}: - num_table_cols -= 3 - if table_name in {'root1', 'root2'}: - num_table_cols -= 1 - - assert num_table_cols == estimated_num_columns[table_name]