From 00cea6d8e835aff55bfe04b09495d0cb28ab0a6e Mon Sep 17 00:00:00 2001 From: Frances Hartwell Date: Tue, 14 May 2024 14:58:51 -0400 Subject: [PATCH] Update logs to write to CSV --- sdv/logging/__init__.py | 4 +- sdv/logging/sdv_logger_config.yml | 8 +- sdv/logging/utils.py | 15 ++++ sdv/multi_table/base.py | 125 +++++++++++---------------- sdv/single_table/base.py | 118 ++++++++++--------------- tests/unit/logging/test_utils.py | 36 +++++++- tests/unit/multi_table/test_base.py | 92 ++++++++++---------- tests/unit/single_table/test_base.py | 92 ++++++++++---------- 8 files changed, 243 insertions(+), 247 deletions(-) diff --git a/sdv/logging/__init__.py b/sdv/logging/__init__.py index c15348231..2c5d10e88 100644 --- a/sdv/logging/__init__.py +++ b/sdv/logging/__init__.py @@ -1,10 +1,12 @@ """Module for configuring loggers within the SDV library.""" from sdv.logging.logger import get_sdv_logger -from sdv.logging.utils import disable_single_table_logger, get_sdv_logger_config +from sdv.logging.utils import ( + disable_single_table_logger, get_sdv_logger_config, load_logfile_dataframe) __all__ = ( 'disable_single_table_logger', 'get_sdv_logger', 'get_sdv_logger_config', + 'load_logfile_dataframe' ) diff --git a/sdv/logging/sdv_logger_config.yml b/sdv/logging/sdv_logger_config.yml index 4b01b0c65..cf3b51710 100644 --- a/sdv/logging/sdv_logger_config.yml +++ b/sdv/logging/sdv_logger_config.yml @@ -6,22 +6,22 @@ loggers: propagate: false handlers: class: logging.FileHandler - filename: sdv_logs.log + filename: sdv_logs.csv MultiTableSynthesizer: level: INFO propagate: false handlers: class: logging.FileHandler - filename: sdv_logs.log + filename: sdv_logs.csv MultiTableMetadata: level: INFO propagate: false handlers: class: logging.FileHandler - filename: sdv_logs.log + filename: sdv_logs.csv SingleTableMetadata: level: INFO propagate: false handlers: class: logging.FileHandler - filename: sdv_logs.log + filename: sdv_logs.csv diff --git a/sdv/logging/utils.py b/sdv/logging/utils.py index 471870649..acea37bd7 100644 --- a/sdv/logging/utils.py +++ b/sdv/logging/utils.py @@ -5,6 +5,7 @@ import shutil from pathlib import Path +import pandas as pd import platformdirs import yaml @@ -49,3 +50,17 @@ def disable_single_table_logger(): finally: for handler in handlers: single_table_logger.addHandler(handler) + + +def load_logfile_dataframe(logfile): + """Load the SDV logfile as a pandas DataFrame with correct column headers. + + Args: + logfile (str): + Path to the SDV log CSV file. + """ + column_names = [ + 'LEVEL', 'EVENT', 'TIMESTAMP', 'SYNTHESIZER CLASS NAME', 'SYNTHESIZER ID', + 'TOTAL NUMBER OF TABLES', 'TOTAL NUMBER OF ROWS', 'TOTAL NUMBER OF COLUMNS' + ] + return pd.read_csv(logfile, names=column_names) diff --git a/sdv/multi_table/base.py b/sdv/multi_table/base.py index 779366cb1..ed08891fb 100644 --- a/sdv/multi_table/base.py +++ b/sdv/multi_table/base.py @@ -119,15 +119,12 @@ def __init__(self, metadata, locales=['en_US'], synthesizer_kwargs=None): self._fitted_sdv_version = None self._fitted_sdv_enterprise_version = None self._synthesizer_id = generate_synthesizer_id(self) - SYNTHESIZER_LOGGER.info( - '\nInstance:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Synthesizer id: %s', - datetime.datetime.now(), - self.__class__.__name__, - self._synthesizer_id - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Instance', + 'TIMESTAMP': datetime.datetime.now(), + 'SYNTHESIZER CLASS NAME': self.__class__.__name__, + 'SYNTHESIZER ID': self._synthesizer_id + }) def set_address_columns(self, table_name, column_names, anonymization_level='full'): """Set the address multi-column transformer. @@ -403,22 +400,16 @@ def fit_processed_data(self, processed_data): total_rows += len(table) total_columns += len(table.columns) - SYNTHESIZER_LOGGER.info( - '\nFit processed data:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Statistics of the fit processed data:\n' - ' Total number of tables: %s\n' - ' Total number of rows: %s\n' - ' Total number of columns: %s\n' - ' Synthesizer id: %s', - datetime.datetime.now(), - self.__class__.__name__, - len(processed_data), - total_rows, - total_columns, - self._synthesizer_id, - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Fit processed data', + 'TIMESTAMP': datetime.datetime.now(), + 'SYNTHESIZER CLASS NAME': self.__class__.__name__, + 'SYNTHESIZER ID': self._synthesizer_id, + 'TOTAL NUMBER OF TABLES': len(processed_data), + 'TOTAL NUMBER OF ROWS': total_rows, + 'TOTAL NUMBER OF COLUMNS': total_columns + }) + check_synthesizer_version(self, is_fit_method=True, compare_operator=operator.lt) with disable_single_table_logger(): augmented_data = self._augment_tables(processed_data) @@ -443,22 +434,16 @@ def fit(self, data): total_rows += len(table) total_columns += len(table.columns) - SYNTHESIZER_LOGGER.info( - '\nFit:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Statistics of the fit data:\n' - ' Total number of tables: %s\n' - ' Total number of rows: %s\n' - ' Total number of columns: %s\n' - ' Synthesizer id: %s', - datetime.datetime.now(), - self.__class__.__name__, - len(data), - total_rows, - total_columns, - self._synthesizer_id, - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Fit', + 'TIMESTAMP': datetime.datetime.now(), + 'SYNTHESIZER CLASS NAME': self.__class__.__name__, + 'SYNTHESIZER ID': self._synthesizer_id, + 'TOTAL NUMBER OF TABLES': len(data), + 'TOTAL NUMBER OF ROWS': total_rows, + 'TOTAL NUMBER OF COLUMNS': total_columns + }) + check_synthesizer_version(self, is_fit_method=True, compare_operator=operator.lt) _validate_foreign_keys_not_null(self.metadata, data) self._check_metadata_updated() @@ -511,22 +496,16 @@ def sample(self, scale=1.0): if table in table_columns: sampled_data[table].columns = table_columns[table] - SYNTHESIZER_LOGGER.info( - '\nSample:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Statistics of the sample size:\n' - ' Total number of tables: %s\n' - ' Total number of rows: %s\n' - ' Total number of columns: %s\n' - ' Synthesizer id: %s', - datetime.datetime.now(), - self.__class__.__name__, - len(sampled_data), - total_rows, - total_columns, - self._synthesizer_id, - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Sample', + 'TIMESTAMP': datetime.datetime.now(), + 'SYNTHESIZER CLASS NAME': self.__class__.__name__, + 'SYNTHESIZER ID': self._synthesizer_id, + 'TOTAL NUMBER OF TABLES': len(sampled_data), + 'TOTAL NUMBER OF ROWS': total_rows, + 'TOTAL NUMBER OF COLUMNS': total_columns + }) + return sampled_data def get_learned_distributions(self, table_name): @@ -692,15 +671,13 @@ def save(self, filepath): Path where the instance will be serialized. """ synthesizer_id = getattr(self, '_synthesizer_id', None) - SYNTHESIZER_LOGGER.info( - '\nSave:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Synthesizer id: %s', - datetime.datetime.now(), - self.__class__.__name__, - synthesizer_id - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Save', + 'TIMESTAMP': datetime.datetime.now(), + 'SYNTHESIZER CLASS NAME': self.__class__.__name__, + 'SYNTHESIZER ID': synthesizer_id, + }) + with open(filepath, 'wb') as output: cloudpickle.dump(self, output) @@ -724,13 +701,11 @@ def load(cls, filepath): if getattr(synthesizer, '_synthesizer_id', None) is None: synthesizer._synthesizer_id = generate_synthesizer_id(synthesizer) - SYNTHESIZER_LOGGER.info( - '\nLoad:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Synthesizer id: %s', - datetime.datetime.now(), - synthesizer.__class__.__name__, - synthesizer._synthesizer_id, - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Load', + 'TIMESTAMP': datetime.datetime.now(), + 'SYNTHESIZER CLASS NAME': synthesizer.__class__.__name__, + 'SYNTHESIZER ID': synthesizer._synthesizer_id, + }) + return synthesizer diff --git a/sdv/single_table/base.py b/sdv/single_table/base.py index 3b3b93122..de474d1c7 100644 --- a/sdv/single_table/base.py +++ b/sdv/single_table/base.py @@ -112,15 +112,12 @@ def __init__(self, metadata, enforce_min_max_values=True, enforce_rounding=True, self._fitted_sdv_version = None self._fitted_sdv_enterprise_version = None self._synthesizer_id = generate_synthesizer_id(self) - SYNTHESIZER_LOGGER.info( - '\nInstance:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Synthesizer id: %s', - datetime.datetime.now(), - self.__class__.__name__, - self._synthesizer_id - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Instance', + 'TIMESTAMP': datetime.datetime.now(), + 'SYNTHESIZER CLASS NAME': self.__class__.__name__, + 'SYNTHESIZER ID': self._synthesizer_id, + }) def set_address_columns(self, column_names, anonymization_level='full'): """Set the address multi-column transformer.""" @@ -420,21 +417,15 @@ def fit_processed_data(self, processed_data): processed_data (pandas.DataFrame): The transformed data used to fit the model to. """ - SYNTHESIZER_LOGGER.info( - '\nFit processed data:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Statistics of the fit processed data:\n' - ' Total number of tables: 1\n' - ' Total number of rows: %s\n' - ' Total number of columns: %s\n' - ' Synthesizer id: %s', - datetime.datetime.now(), - self.__class__.__name__, - len(processed_data), - len(processed_data.columns), - self._synthesizer_id, - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Fit processed data', + 'TIMESTAMP': datetime.datetime.now(), + 'SYNTHESIZER CLASS NAME': self.__class__.__name__, + 'SYNTHESIZER ID': self._synthesizer_id, + 'TOTAL NUMBER OF TABLES': 1, + 'TOTAL NUMBER OF ROWS': len(processed_data), + 'TOTAL NUMBER OF COLUMNS': len(processed_data.columns) + }) check_synthesizer_version(self, is_fit_method=True, compare_operator=operator.lt) if not processed_data.empty: @@ -452,21 +443,15 @@ def fit(self, data): data (pandas.DataFrame): The raw data (before any transformations) to fit the model to. """ - SYNTHESIZER_LOGGER.info( - '\nFit:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Statistics of the fit data:\n' - ' Total number of tables: 1\n' - ' Total number of rows: %s\n' - ' Total number of columns: %s\n' - ' Synthesizer id: %s', - datetime.datetime.now(), - self.__class__.__name__, - len(data), - len(data.columns), - self._synthesizer_id, - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Fit', + 'TIMESTAMP': datetime.datetime.now(), + 'SYNTHESIZER CLASS NAME': self.__class__.__name__, + 'SYNTHESIZER ID': self._synthesizer_id, + 'TOTAL NUMBER OF TABLES': 1, + 'TOTAL NUMBER OF ROWS': len(data), + 'TOTAL NUMBER OF COLUMNS': len(data.columns) + }) check_synthesizer_version(self, is_fit_method=True, compare_operator=operator.lt) self._check_metadata_updated() @@ -484,15 +469,12 @@ def save(self, filepath): Path where the synthesizer instance will be serialized. """ synthesizer_id = getattr(self, '_synthesizer_id', None) - SYNTHESIZER_LOGGER.info( - '\nSave:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Synthesizer id: %s', - datetime.datetime.now(), - self.__class__.__name__, - synthesizer_id - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Save', + 'TIMESTAMP': datetime.datetime.now(), + 'SYNTHESIZER CLASS NAME': self.__class__.__name__, + 'SYNTHESIZER ID': synthesizer_id, + }) with open(filepath, 'wb') as output: cloudpickle.dump(self, output) @@ -517,15 +499,12 @@ def load(cls, filepath): if getattr(synthesizer, '_synthesizer_id', None) is None: synthesizer._synthesizer_id = generate_synthesizer_id(synthesizer) - SYNTHESIZER_LOGGER.info( - '\nLoad:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Synthesizer id: %s', - datetime.datetime.now(), - synthesizer.__class__.__name__, - synthesizer._synthesizer_id, - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Load', + 'TIMESTAMP': datetime.datetime.now(), + 'SYNTHESIZER CLASS NAME': synthesizer.__class__.__name__, + 'SYNTHESIZER ID': synthesizer._synthesizer_id, + }) return synthesizer @@ -913,21 +892,16 @@ def sample(self, num_rows, max_tries_per_batch=100, batch_size=None, output_file if not original_columns.empty: sampled_data.columns = self._original_columns - SYNTHESIZER_LOGGER.info( - '\nSample:\n' - ' Timestamp: %s\n' - ' Synthesizer class name: %s\n' - ' Statistics of the sample size:\n' - ' Total number of tables: 1\n' - ' Total number of rows: %s\n' - ' Total number of columns: %s\n' - ' Synthesizer id: %s', - sample_timestamp, - self.__class__.__name__, - len(sampled_data), - len(sampled_data.columns), - self._synthesizer_id, - ) + SYNTHESIZER_LOGGER.info({ + 'EVENT': 'Sample', + 'TIMESTAMP': sample_timestamp, + 'SYNTHESIZER CLASS NAME': self.__class__.__name__, + 'SYNTHESIZER ID': self._synthesizer_id, + 'TOTAL NUMBER OF TABLES': 1, + 'TOTAL NUMBER OF ROWS': len(sampled_data), + 'TOTAL NUMBER OF COLUMNS': len(sampled_data.columns) + + }) return sampled_data diff --git a/tests/unit/logging/test_utils.py b/tests/unit/logging/test_utils.py index b585cb880..1bcf74788 100644 --- a/tests/unit/logging/test_utils.py +++ b/tests/unit/logging/test_utils.py @@ -1,7 +1,12 @@ """Test ``SDV`` logging utilities.""" +from io import StringIO from unittest.mock import Mock, mock_open, patch -from sdv.logging.utils import disable_single_table_logger, get_sdv_logger_config +import numpy as np +import pandas as pd + +from sdv.logging.utils import ( + disable_single_table_logger, get_sdv_logger_config, load_logfile_dataframe) def test_get_sdv_logger_config(): @@ -54,3 +59,32 @@ def test_disable_single_table_logger(mock_getlogger): # Assert assert len(mock_logger.handlers) == 1 + + +def test_load_logfile_dataframe(): + """Test loading the CSV logfile into a DataFrame""" + # Setup + logfile = StringIO( + 'INFO,Instance,2024-05-14 11:29:00.649735,GaussianCopulaSynthesizer,' + 'GaussianCopulaSynthesizer_1.12.1_5387a6e9f4d,,,\n' + 'INFO,Fit,2024-05-14 11:29:00.649735,GaussianCopulaSynthesizer,' + 'GaussianCopulaSynthesizer_1.12.1_5387a6e9f4d,1,500,9\n' + 'INFO,Sample,2024-05-14 11:29:00.649735,GaussianCopulaSynthesizer,' + 'GaussianCopulaSynthesizer_1.12.1_5387a6e9f4d,1,500,6\n' + ) + + # Run + log_dataframe = load_logfile_dataframe(logfile) + + # Assert + expected_log = pd.DataFrame({ + 'LEVEL': ['INFO'] * 3, + 'EVENT': ['Instance', 'Fit', 'Sample'], + 'TIMESTAMP': ['2024-05-14 11:29:00.649735'] * 3, + 'SYNTHESIZER CLASS NAME': ['GaussianCopulaSynthesizer'] * 3, + 'SYNTHESIZER ID': ['GaussianCopulaSynthesizer_1.12.1_5387a6e9f4d'] * 3, + 'TOTAL NUMBER OF TABLES': [np.nan, 1, 1], + 'TOTAL NUMBER OF ROWS': [np.nan, 500, 500], + 'TOTAL NUMBER OF COLUMNS': [np.nan, 9, 6] + }) + pd.testing.assert_frame_equal(log_dataframe, expected_log) diff --git a/tests/unit/multi_table/test_base.py b/tests/unit/multi_table/test_base.py index ffcd63148..c4fe83d6e 100644 --- a/tests/unit/multi_table/test_base.py +++ b/tests/unit/multi_table/test_base.py @@ -133,11 +133,12 @@ def test___init__(self, mock_check_metadata_updated, mock_generate_synthesizer_i mock_check_metadata_updated.assert_called_once() mock_generate_synthesizer_id.assert_called_once_with(instance) assert instance._synthesizer_id == synthesizer_id - assert caplog.messages[0] == ( - '\nInstance:\n Timestamp: 2024-04-19 16:20:10.037183\n Synthesizer class name: ' - 'BaseMultiTableSynthesizer\n Synthesizer id: ' - 'BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Instance', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'BaseMultiTableSynthesizer', + 'SYNTHESIZER ID': 'BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' + }) def test__init__column_relationship_warning(self): """Test that a warning is raised only once when the metadata has column relationships.""" @@ -927,16 +928,15 @@ def test_fit_processed_data(self, mock_datetime, caplog): instance._augment_tables.assert_called_once_with(processed_data) instance._model_tables.assert_called_once_with(instance._augment_tables.return_value) assert instance._fitted - assert caplog.messages[0] == ( - '\nFit processed data:\n' - ' Timestamp: 2024-04-19 16:20:10.037183\n' - ' Synthesizer class name: Mock\n' - ' Statistics of the fit processed data:\n' - ' Total number of tables: 2\n' - ' Total number of rows: 6\n' - ' Total number of columns: 4\n' - ' Synthesizer id: BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Fit processed data', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'Mock', + 'SYNTHESIZER ID': 'BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5', + 'TOTAL NUMBER OF TABLES': 2, + 'TOTAL NUMBER OF ROWS': 6, + 'TOTAL NUMBER OF COLUMNS': 4 + }) def test_fit_processed_data_empty_table(self): """Test attributes are properly set when data is empty and that _fit is not called.""" @@ -1012,16 +1012,15 @@ def test_fit(self, mock_validate_foreign_keys_not_null, mock_datetime, caplog): instance.preprocess.assert_called_once_with(data) instance.fit_processed_data.assert_called_once_with(instance.preprocess.return_value) instance._check_metadata_updated.assert_called_once() - assert caplog.messages[0] == ( - '\nFit:\n' - ' Timestamp: 2024-04-19 16:20:10.037183\n' - ' Synthesizer class name: Mock\n' - ' Statistics of the fit data:\n' - ' Total number of tables: 2\n' - ' Total number of rows: 6\n' - ' Total number of columns: 4\n' - ' Synthesizer id: BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Fit', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'Mock', + 'SYNTHESIZER ID': 'BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5', + 'TOTAL NUMBER OF TABLES': 2, + 'TOTAL NUMBER OF ROWS': 6, + 'TOTAL NUMBER OF COLUMNS': 4 + }) def test_fit_raises_version_error(self): """Test that fit will raise a ``VersionError`` if the current version is bigger.""" @@ -1148,16 +1147,15 @@ def test_sample(self, mock_datetime, caplog): # Assert instance._sample.assert_called_once_with(scale=1.5) - assert caplog.messages[0] == ( - '\nSample:\n' - ' Timestamp: 2024-04-19 16:20:10.037183\n' - ' Synthesizer class name: BaseMultiTableSynthesizer\n' - ' Statistics of the sample size:\n' - ' Total number of tables: 2\n' - ' Total number of rows: 6\n' - ' Total number of columns: 4\n' - ' Synthesizer id: BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Sample', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'BaseMultiTableSynthesizer', + 'SYNTHESIZER ID': 'BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5', + 'TOTAL NUMBER OF TABLES': 2, + 'TOTAL NUMBER OF ROWS': 6, + 'TOTAL NUMBER OF COLUMNS': 4 + }) def test_get_learned_distributions_raises_an_unfitted_error(self): """Test that ``get_learned_distributions`` raises an error when model is not fitted.""" @@ -1563,12 +1561,12 @@ def test_save(self, cloudpickle_mock, mock_datetime, tmp_path, caplog): # Assert cloudpickle_mock.dump.assert_called_once_with(synthesizer, ANY) - assert caplog.messages[0] == ( - '\nSave:\n' - ' Timestamp: 2024-04-19 16:20:10.037183\n' - ' Synthesizer class name: Mock\n' - ' Synthesizer id: BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Save', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'Mock', + 'SYNTHESIZER ID': 'BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5', + }) @patch('sdv.multi_table.base.datetime') @patch('sdv.multi_table.base.generate_synthesizer_id') @@ -1599,9 +1597,9 @@ def test_load(self, mock_file, cloudpickle_mock, mock_check_synthesizer_version.assert_called_once_with(synthesizer_mock) assert loaded_instance._synthesizer_id == synthesizer_id mock_generate_synthesizer_id.assert_called_once_with(synthesizer_mock) - assert caplog.messages[0] == ( - '\nLoad:\n' - ' Timestamp: 2024-04-19 16:20:10.037183\n' - ' Synthesizer class name: Mock\n' - ' Synthesizer id: BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Load', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'Mock', + 'SYNTHESIZER ID': 'BaseMultiTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5', + }) diff --git a/tests/unit/single_table/test_base.py b/tests/unit/single_table/test_base.py index dbaf35018..3074d8506 100644 --- a/tests/unit/single_table/test_base.py +++ b/tests/unit/single_table/test_base.py @@ -94,11 +94,12 @@ def test___init__(self, mock_check_metadata_updated, mock_data_processor, metadata.validate.assert_called_once_with() mock_check_metadata_updated.assert_called_once() mock_generate_synthesizer_id.assert_called_once_with(instance) - assert caplog.messages[0] == ( - '\nInstance:\n Timestamp: 2024-04-19 16:20:10.037183\n Synthesizer class name: ' - 'BaseSingleTableSynthesizer\n Synthesizer id: ' - 'BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Instance', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'BaseSingleTableSynthesizer', + 'SYNTHESIZER ID': 'BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' + }) @patch('sdv.single_table.base.DataProcessor') def test___init__custom(self, mock_data_processor): @@ -398,16 +399,15 @@ def test_fit_processed_data(self, mock_datetime, caplog): # Assert instance._fit.assert_called_once_with(processed_data) - assert caplog.messages[0] == ( - '\nFit processed data:\n' - ' Timestamp: 2024-04-19 16:20:10.037183\n' - ' Synthesizer class name: Mock\n' - ' Statistics of the fit processed data:\n' - ' Total number of tables: 1\n' - ' Total number of rows: 3\n' - ' Total number of columns: 1\n' - ' Synthesizer id: BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Fit processed data', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'Mock', + 'SYNTHESIZER ID': 'BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5', + 'TOTAL NUMBER OF TABLES': 1, + 'TOTAL NUMBER OF ROWS': 3, + 'TOTAL NUMBER OF COLUMNS': 1 + }) def test_fit_processed_data_raises_version_error(self): """Test that ``fit`` raises ``VersionError`` @@ -461,16 +461,15 @@ def test_fit(self, mock_datetime, caplog): instance.preprocess.assert_called_once_with(data) instance.fit_processed_data.assert_called_once_with(instance.preprocess.return_value) instance._check_metadata_updated.assert_called_once() - assert caplog.messages[0] == ( - '\nFit:\n' - ' Timestamp: 2024-04-19 16:20:10.037183\n' - ' Synthesizer class name: Mock\n' - ' Statistics of the fit data:\n' - ' Total number of tables: 1\n' - ' Total number of rows: 3\n' - ' Total number of columns: 2\n' - ' Synthesizer id: BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Fit', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'Mock', + 'SYNTHESIZER ID': 'BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5', + 'TOTAL NUMBER OF TABLES': 1, + 'TOTAL NUMBER OF ROWS': 3, + 'TOTAL NUMBER OF COLUMNS': 2 + }) def test_fit_raises_version_error(self): """Test that ``fit`` raises ``VersionError`` @@ -1476,16 +1475,15 @@ def test_sample(self, mock_datetime, caplog): show_progress_bar=True ) pd.testing.assert_frame_equal(result, pd.DataFrame({'col': [1, 2, 3]})) - assert caplog.messages[0] == ( - '\nSample:\n' - ' Timestamp: 2024-04-19 16:20:10.037183\n' - ' Synthesizer class name: Mock\n' - ' Statistics of the sample size:\n' - ' Total number of tables: 1\n' - ' Total number of rows: 3\n' - ' Total number of columns: 1\n' - ' Synthesizer id: BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Sample', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'Mock', + 'SYNTHESIZER ID': 'BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5', + 'TOTAL NUMBER OF TABLES': 1, + 'TOTAL NUMBER OF ROWS': 3, + 'TOTAL NUMBER OF COLUMNS': 1 + }) def test__validate_conditions_unseen_columns(self): """Test that conditions are within the ``data_processor`` fields.""" @@ -1855,12 +1853,12 @@ def test_save(self, cloudpickle_mock, mock_datetime, tmp_path, caplog): # Assert cloudpickle_mock.dump.assert_called_once_with(synthesizer, ANY) - assert caplog.messages[0] == ( - '\nSave:\n' - ' Timestamp: 2024-04-19 16:20:10.037183\n' - ' Synthesizer class name: Mock\n' - ' Synthesizer id: BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Save', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'Mock', + 'SYNTHESIZER ID': 'BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5', + }) @patch('sdv.single_table.base.datetime') @patch('sdv.single_table.base.generate_synthesizer_id') @@ -1891,12 +1889,12 @@ def test_load(self, mock_file, cloudpickle_mock, mock_check_sdv_versions_and_war assert loaded_instance._synthesizer_id == synthesizer_id mock_check_synthesizer_version.assert_called_once_with(synthesizer_mock) mock_generate_synthesizer_id.assert_called_once_with(synthesizer_mock) - assert caplog.messages[0] == ( - '\nLoad:\n' - ' Timestamp: 2024-04-19 16:20:10.037183\n' - ' Synthesizer class name: Mock\n' - ' Synthesizer id: BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5' - ) + assert caplog.messages[0] == str({ + 'EVENT': 'Load', + 'TIMESTAMP': '2024-04-19 16:20:10.037183', + 'SYNTHESIZER CLASS NAME': 'Mock', + 'SYNTHESIZER ID': 'BaseSingleTableSynthesizer_1.0.0_92aff11e9a5649d1a280990d1231a5f5', + }) def test_load_custom_constraint_classes(self): """Test that ``load_custom_constraint_classes`` calls the ``DataProcessor``'s method."""