From ed52974218819f4dc37df494f6aa3a11620ac66b Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:34:27 +0200 Subject: [PATCH] Add warning for `datetimes` when they are missing `datetime_format` and are represented as `dtype` object (#1897) --- sdv/metadata/multi_table.py | 33 ++++++++++++++- sdv/metadata/single_table.py | 53 ++++++++++++++++++++++-- tests/unit/metadata/test_multi_table.py | 53 ++++++++++++++++++++++++ tests/unit/metadata/test_single_table.py | 50 ++++++++++++++++++++++ 4 files changed, 184 insertions(+), 5 deletions(-) diff --git a/sdv/metadata/multi_table.py b/sdv/metadata/multi_table.py index 1cf48b587..194dc7ba7 100644 --- a/sdv/metadata/multi_table.py +++ b/sdv/metadata/multi_table.py @@ -19,6 +19,7 @@ create_columns_node, create_summarized_columns_node, visualize_graph) LOGGER = logging.getLogger(__name__) +WARNINGS_COLUMN_ORDER = ['Table Name', 'Column Name', 'sdtype', 'datetime_format'] class MultiTableMetadata: @@ -694,6 +695,7 @@ def _validate_single_table(self, errors): errors.append(error_message) try: table.validate() + except Exception as error: errors.append('\n') title = f'Table: {table_name}' @@ -753,9 +755,12 @@ def _validate_missing_tables(self, data): def _validate_all_tables(self, data): """Validate every table of the data has a valid table/metadata pair.""" errors = [] + warning_dataframes = [] for table_name, table_data in data.items(): + table_sdtype_warnings = defaultdict(list) try: - self.tables[table_name].validate_data(table_data) + with warnings.catch_warnings(record=True): + self.tables[table_name].validate_data(table_data, table_sdtype_warnings) except InvalidDataError as error: error_msg = f"Table: '{table_name}'" @@ -770,6 +775,24 @@ def _validate_all_tables(self, data): except KeyError: continue + finally: + if table_sdtype_warnings: + table_sdtype_warnings['Table Name'].extend( + [table_name] * len(table_sdtype_warnings['Column Name']) + ) + df = pd.DataFrame(table_sdtype_warnings, columns=WARNINGS_COLUMN_ORDER) + warning_dataframes.append(df) + + if warning_dataframes: + warning_df = pd.concat(warning_dataframes) + warning_msg = ( + "No 'datetime_format' is present in the metadata for the following columns:\n " + f'{warning_df.to_string(index=False)}\n' + 'Without this specification, SDV may not be able to accurately parse the data. ' + "We recommend adding datetime formats using 'update_column'." + ) + warnings.warn(warning_msg) + return errors def _validate_foreign_keys(self, data): @@ -816,6 +839,14 @@ def validate_data(self, data): Args: data (pd.DataFrame): The data to validate. + + Raises: + InvalidDataError: + This error is being raised if the data is not matching its sdtype requirements. + + Warns: + A warning is being raised if ``datetime_format`` is missing from a column represented + as ``object`` in the dataframe and its sdtype is ``datetime``. """ errors = [] errors += self._validate_missing_tables(data) diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index 5707a9b46..ce81ad3a0 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -4,6 +4,7 @@ import logging import re import warnings +from collections import defaultdict from copy import deepcopy from datetime import datetime @@ -1034,8 +1035,24 @@ def _get_invalid_column_values(column, validation_function): return set(column[~valid]) - def _validate_column_data(self, column): - """Validate values of the column satisfy its sdtype properties.""" + def _validate_column_data(self, column, sdtype_warnings): + """Validate the values of the given column against its specified sdtype properties. + + The function checks the sdtype of the column and validates the data accordingly. If there + are any errors, those are being appended to a list of errors that will be returned. + Additionally ``sdtype_warnings`` is being updated with ``datetime_format`` warnings + to be raised later. + + Args: + column (pd.Series): + The data to validate against. + sdtype_warnings (defaultdict[list]): + A ``defaultdict`` with ``list`` to add warning messages. + + Returns: + list: + A list containing any validation error messages found during the process. + """ column_metadata = self.columns[column.name] sdtype = column_metadata['sdtype'] invalid_values = None @@ -1064,13 +1081,18 @@ def _validate_column_data(self, column): lambda x: pd.isna(x) | _is_datetime_type(x) ) + if datetime_format is None and column.dtype == 'O': + sdtype_warnings['Column Name'].append(column.name) + sdtype_warnings['sdtype'].append(sdtype) + sdtype_warnings['datetime_format'].append(datetime_format) + if invalid_values: invalid_values = _format_invalid_values_string(invalid_values, 3) return [f"Invalid values found for {sdtype} column '{column.name}': {invalid_values}."] return [] - def validate_data(self, data): + def validate_data(self, data, sdtype_warnings=None): """Validate the data matches the metadata. Checks the metadata follows the following rules: @@ -1078,11 +1100,23 @@ def validate_data(self, data): * keys don't have missing values * primary or alternate keys are unique * values of a column satisfy their sdtype + * datetimes represented as objects have ``datetime_format`` (warning only). Args: data (pd.DataFrame): The data to validate. + sdtype_warnings (defaultdict[list] or None): + A ``defaultdict`` with ``list`` to add warning messages. + + Raises: + InvalidDataError: + This error is being raised if the data is not matching its sdtype requirements. + + Warns: + A warning is being raised if ``datetime_format`` is missing from a column represented + as ``object`` in the dataframe and its sdtype is ``datetime``. """ + sdtype_warnings = sdtype_warnings if sdtype_warnings is not None else defaultdict(list) if not isinstance(data, pd.DataFrame): raise ValueError(f'Data must be a DataFrame, not a {type(data)}.') @@ -1098,7 +1132,18 @@ def validate_data(self, data): # Every column must satisfy the properties of their sdtypes for column in data: - errors += self._validate_column_data(data[column]) + errors += self._validate_column_data(data[column], sdtype_warnings) + + if sdtype_warnings is not None and len(sdtype_warnings): + df = pd.DataFrame(sdtype_warnings) + message = ( + "No 'datetime_format' is present in the metadata for the following columns:\n" + f'{df.to_string(index=False)}\n' + 'Without this specification, SDV may not be able to accurately parse the data. ' + "We recommend adding datetime formats using 'update_column'." + ) + + warnings.warn(message) if errors: raise InvalidDataError(errors) diff --git a/tests/unit/metadata/test_multi_table.py b/tests/unit/metadata/test_multi_table.py index e2b1e24d4..5ff18635e 100644 --- a/tests/unit/metadata/test_multi_table.py +++ b/tests/unit/metadata/test_multi_table.py @@ -1408,6 +1408,59 @@ def test_validate_data_missing_foreign_keys(self): with pytest.raises(InvalidDataError, match=error_msg): metadata.validate_data(data) + def test_validate_data_datetime_warning(self): + """Test validation for columns with datetime. + + If the datetime format is not provided, a warning should be shwon if the ``dtype`` is + object. + """ + # Setup + metadata = get_multi_table_metadata() + data = get_multi_table_data() + + data['upravna_enota']['warning_date_str'] = [ + '2022-09-02', + '2022-09-16', + '2022-08-26', + '2022-08-26' + ] + data['upravna_enota']['valid_date'] = [ + '20220902110443000000', + '20220916230356000000', + '20220826173917000000', + '20220929111311000000' + ] + data['upravna_enota']['datetime'] = pd.to_datetime([ + '20220902', + '20220916', + '20220826', + '20220826' + ]) + metadata.add_column('upravna_enota', 'warning_date_str', sdtype='datetime') + metadata.add_column( + 'upravna_enota', + 'valid_date', + sdtype='datetime', + datetime_format='%Y%m%d%H%M%S%f' + ) + metadata.add_column('upravna_enota', 'datetime', sdtype='datetime') + + # Run and Assert + warning_df = pd.DataFrame({ + 'Table Name': ['upravna_enota'], + 'Column Name': ['warning_date_str'], + 'sdtype': ['datetime'], + 'datetime_format': [None] + }) + warning_msg = ( + "No 'datetime_format' is present in the metadata for the following columns:\n " + f'{warning_df.to_string(index=False)}\n' + 'Without this specification, SDV may not be able to accurately parse the data. ' + "We recommend adding datetime formats using 'update_column'." + ) + with pytest.warns(UserWarning, match=warning_msg): + metadata.validate_data(data) + @patch('sdv.metadata.multi_table.SingleTableMetadata') def test_add_table(self, table_metadata_mock): """Test that the method adds the table name to ``instance.tables``.""" diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index 5b2234f9d..1b41414b0 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -2538,6 +2538,56 @@ def test_validate_data_datetime_sdtype(self): with pytest.raises(InvalidDataError, match=err_msg): metadata.validate_data(data) + def test_validate_data_datetime_warning(self): + """Test validation for columns with datetime. + + If the datetime format is not provided, a warning should be shwon if the ``dtype`` is + object. + """ + # Setup + data = pd.DataFrame({ + 'warning_date_str': [ + '2022-09-02', + '2022-09-16', + '2022-08-26', + '2022-08-26', + '2022-09-29' + ], + 'valid_date': [ + '20220902110443000000', + '20220916230356000000', + '20220826173917000000', + '20220826212135000000', + '20220929111311000000' + ], + 'datetime': pd.to_datetime([ + '20220902', + '20220916', + '20220826', + '20220826', + '20220929' + ]) + }) + metadata = SingleTableMetadata() + metadata.add_column('warning_date_str', sdtype='datetime') + metadata.add_column('valid_date', sdtype='datetime', datetime_format='%Y%m%d%H%M%S%f') + metadata.add_column('datetime', sdtype='datetime') + + # Run and Assert + warning_frame = pd.DataFrame({ + 'Column Name': ['warning_date_str'], + 'sdtype': ['datetime'], + 'datetime_format': [None] + }) + warning_msg = ( + "No 'datetime_format' is present in the metadata for the following columns:\n" + f'{warning_frame.to_string(index=False)}\n' + 'Without this specification, SDV may not be able to accurately parse the data. ' + "We recommend adding datetime formats using 'update_column'." + ) + with pytest.warns(UserWarning, match=warning_msg): + metadata.validate_data(data) + def test_validate_data(self): """Test the method doesn't crash when the passed data is valid.