move unit tests to integration tests

sdv-dev · Aug 6, 2024 · 618f003 · 618f003
1 parent 0bda009
commit 618f003
Show file tree

Hide file tree

Showing 2 changed files with 110 additions and 108 deletions.
diff --git a/tests/integration/multi_table/test_hma.py b/tests/integration/multi_table/test_hma.py
@@ -4,6 +4,7 @@
 import math
 import re
 import warnings
+from unittest.mock import Mock
 
 import faker
 import numpy as np
@@ -2019,3 +2020,112 @@ def test_fit_int_primary_key_regex_includes_zero(regex):
     )
     with pytest.raises(SynthesizerInputError, match=message):
         instance.fit(data)
+
+
+def test__estimate_num_columns_to_be_modeled_various_sdtypes():
+    """Test the estimated number of columns is correct for various sdtypes.
+
+    To check that the number columns is correct we Mock the ``_finalize`` method
+    and compare its output with the estimated number of columns.
+
+    The dataset used follows the structure below:
+        R1 R2
+        | /
+        GP
+        |
+        P
+    """
+    # Setup
+    root1 = pd.DataFrame({'R1': [0, 1, 2]})
+    root2 = pd.DataFrame({'R2': [0, 1, 2], 'data': [0, 1, 2]})
+    grandparent = pd.DataFrame({'GP': [0, 1, 2], 'R1': [0, 1, 2], 'R2': [0, 1, 2]})
+    parent = pd.DataFrame({
+        'P': [0, 1, 2],
+        'GP': [0, 1, 2],
+        'numerical': [0.1, 0.5, np.nan],
+        'categorical': ['a', np.nan, 'c'],
+        'datetime': [None, '2019-01-02', '2019-01-03'],
+        'boolean': [float('nan'), False, True],
+        'id': [0, 1, 2],
+    })
+    data = {
+        'root1': root1,
+        'root2': root2,
+        'grandparent': grandparent,
+        'parent': parent,
+    }
+    metadata = MultiTableMetadata.load_from_dict({
+        'tables': {
+            'root1': {
+                'primary_key': 'R1',
+                'columns': {
+                    'R1': {'sdtype': 'id'},
+                },
+            },
+            'root2': {
+                'primary_key': 'R2',
+                'columns': {'R2': {'sdtype': 'id'}, 'data': {'sdtype': 'numerical'}},
+            },
+            'grandparent': {
+                'primary_key': 'GP',
+                'columns': {
+                    'GP': {'sdtype': 'id'},
+                    'R1': {'sdtype': 'id'},
+                    'R2': {'sdtype': 'id'},
+                },
+            },
+            'parent': {
+                'primary_key': 'P',
+                'columns': {
+                    'P': {'sdtype': 'id'},
+                    'GP': {'sdtype': 'id'},
+                    'numerical': {'sdtype': 'numerical'},
+                    'categorical': {'sdtype': 'categorical'},
+                    'datetime': {'sdtype': 'datetime'},
+                    'boolean': {'sdtype': 'boolean'},
+                    'id': {'sdtype': 'id'},
+                },
+            },
+        },
+        'relationships': [
+            {
+                'parent_table_name': 'root1',
+                'parent_primary_key': 'R1',
+                'child_table_name': 'grandparent',
+                'child_foreign_key': 'R1',
+            },
+            {
+                'parent_table_name': 'root2',
+                'parent_primary_key': 'R2',
+                'child_table_name': 'grandparent',
+                'child_foreign_key': 'R2',
+            },
+            {
+                'parent_table_name': 'grandparent',
+                'parent_primary_key': 'GP',
+                'child_table_name': 'parent',
+                'child_foreign_key': 'GP',
+            },
+        ],
+    })
+    synthesizer = HMASynthesizer(metadata)
+    synthesizer._finalize = Mock(return_value=data)
+
+    # Run estimation
+    estimated_num_columns = synthesizer._estimate_num_columns(metadata)
+
+    # Run actual modeling
+    synthesizer.fit(data)
+    synthesizer.sample()
+
+    # Assert estimated number of columns is correct
+    tables = synthesizer._finalize.call_args[0][0]
+    for table_name, table in tables.items():
+        # Subract all the id columns present in the data, as those are not estimated
+        num_table_cols = len(table.columns)
+        if table_name in {'parent', 'grandparent'}:
+            num_table_cols -= 3
+        if table_name in {'root1', 'root2'}:
+            num_table_cols -= 1
+
+        assert num_table_cols == estimated_num_columns[table_name]
diff --git a/tests/unit/multi_table/test_hma.py b/tests/unit/multi_table/test_hma.py
@@ -1119,111 +1119,3 @@ def test__estimate_num_columns_to_be_modeled(self):
                 num_table_cols -= 1
 
             assert num_table_cols == estimated_num_columns[table_name]
-
-    def test__estimate_num_columns_to_be_modeled_various_sdtypes(self):
-        """Test the estimated number of columns is correct for various sdtypes.
-
-        To check that the number columns is correct we Mock the ``_finalize`` method
-        and compare its output with the estimated number of columns.
-
-        The dataset used follows the structure below:
-            R1 R2
-            | /
-            GP
-            |
-            P
-        """
-        # Setup
-        root1 = pd.DataFrame({'R1': [0, 1, 2]})
-        root2 = pd.DataFrame({'R2': [0, 1, 2], 'data': [0, 1, 2]})
-        grandparent = pd.DataFrame({'GP': [0, 1, 2], 'R1': [0, 1, 2], 'R2': [0, 1, 2]})
-        parent = pd.DataFrame({
-            'P': [0, 1, 2],
-            'GP': [0, 1, 2],
-            'numerical': [0.1, 0.5, np.nan],
-            'categorical': ['a', np.nan, 'c'],
-            'datetime': [None, '2019-01-02', '2019-01-03'],
-            'boolean': [float('nan'), False, True],
-            'id': [0, 1, 2],
-        })
-        data = {
-            'root1': root1,
-            'root2': root2,
-            'grandparent': grandparent,
-            'parent': parent,
-        }
-        metadata = MultiTableMetadata.load_from_dict({
-            'tables': {
-                'root1': {
-                    'primary_key': 'R1',
-                    'columns': {
-                        'R1': {'sdtype': 'id'},
-                    },
-                },
-                'root2': {
-                    'primary_key': 'R2',
-                    'columns': {'R2': {'sdtype': 'id'}, 'data': {'sdtype': 'numerical'}},
-                },
-                'grandparent': {
-                    'primary_key': 'GP',
-                    'columns': {
-                        'GP': {'sdtype': 'id'},
-                        'R1': {'sdtype': 'id'},
-                        'R2': {'sdtype': 'id'},
-                    },
-                },
-                'parent': {
-                    'primary_key': 'P',
-                    'columns': {
-                        'P': {'sdtype': 'id'},
-                        'GP': {'sdtype': 'id'},
-                        'numerical': {'sdtype': 'numerical'},
-                        'categorical': {'sdtype': 'categorical'},
-                        'datetime': {'sdtype': 'datetime'},
-                        'boolean': {'sdtype': 'boolean'},
-                        'id': {'sdtype': 'id'},
-                    },
-                },
-            },
-            'relationships': [
-                {
-                    'parent_table_name': 'root1',
-                    'parent_primary_key': 'R1',
-                    'child_table_name': 'grandparent',
-                    'child_foreign_key': 'R1',
-                },
-                {
-                    'parent_table_name': 'root2',
-                    'parent_primary_key': 'R2',
-                    'child_table_name': 'grandparent',
-                    'child_foreign_key': 'R2',
-                },
-                {
-                    'parent_table_name': 'grandparent',
-                    'parent_primary_key': 'GP',
-                    'child_table_name': 'parent',
-                    'child_foreign_key': 'GP',
-                },
-            ],
-        })
-        synthesizer = HMASynthesizer(metadata)
-        synthesizer._finalize = Mock(return_value=data)
-
-        # Run estimation
-        estimated_num_columns = synthesizer._estimate_num_columns(metadata)
-
-        # Run actual modeling
-        synthesizer.fit(data)
-        synthesizer.sample()
-
-        # Assert estimated number of columns is correct
-        tables = synthesizer._finalize.call_args[0][0]
-        for table_name, table in tables.items():
-            # Subract all the id columns present in the data, as those are not estimated
-            num_table_cols = len(table.columns)
-            if table_name in {'parent', 'grandparent'}:
-                num_table_cols -= 3
-            if table_name in {'root1', 'root2'}:
-                num_table_cols -= 1
-
-            assert num_table_cols == estimated_num_columns[table_name]