From c9b1ff2dc05cb5772a323849a4c62f34c226966d Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 5 Apr 2024 18:48:18 +0200 Subject: [PATCH] Add warnings for datetimes --- sdv/metadata/multi_table.py | 24 +++++++++++++++++++++++- sdv/metadata/single_table.py | 34 ++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/sdv/metadata/multi_table.py b/sdv/metadata/multi_table.py index 1cf48b587..b108dafa4 100644 --- a/sdv/metadata/multi_table.py +++ b/sdv/metadata/multi_table.py @@ -19,6 +19,7 @@ create_columns_node, create_summarized_columns_node, visualize_graph) LOGGER = logging.getLogger(__name__) +WARNINGS_COLUMN_ORDER = ['Table Name', 'Column Name', 'sdtype', 'datetime_format'] class MultiTableMetadata: @@ -694,6 +695,7 @@ def _validate_single_table(self, errors): errors.append(error_message) try: table.validate() + except Exception as error: errors.append('\n') title = f'Table: {table_name}' @@ -753,9 +755,12 @@ def _validate_missing_tables(self, data): def _validate_all_tables(self, data): """Validate every table of the data has a valid table/metadata pair.""" errors = [] + warning_dataframes = [] for table_name, table_data in data.items(): + table_sdtype_warnings = defaultdict(list) try: - self.tables[table_name].validate_data(table_data) + with warnings.catch_warnings(record=True): + self.tables[table_name].validate_data(table_data, table_sdtype_warnings) except InvalidDataError as error: error_msg = f"Table: '{table_name}'" @@ -770,6 +775,23 @@ def _validate_all_tables(self, data): except KeyError: continue + finally: + if table_sdtype_warnings: + table_sdtype_warnings['Table Name'].extend( + [table_name] * len(table_sdtype_warnings['Column Name']) + ) + df = pd.DataFrame(table_sdtype_warnings, columns=WARNINGS_COLUMN_ORDER) + warning_dataframes.append(df) + + if warning_dataframes: + df = pd.concat(warning_dataframes) + warnings.warn( + "No 'datetime_format' is present in the metadata for the following columns:\n " + f'{df.to_string(index=False)}\n\n' + 'Without this specification, SDV may not be able to accurately parse the data. ' + "We recommend adding datetime formats using 'update_column'." + ) + return errors def _validate_foreign_keys(self, data): diff --git a/sdv/metadata/single_table.py b/sdv/metadata/single_table.py index 2d16205ee..3080ace1a 100644 --- a/sdv/metadata/single_table.py +++ b/sdv/metadata/single_table.py @@ -4,6 +4,7 @@ import logging import re import warnings +from collections import defaultdict from copy import deepcopy from datetime import datetime @@ -1030,8 +1031,16 @@ def _get_invalid_column_values(column, validation_function): return set(column[~valid]) - def _validate_column_data(self, column): - """Validate values of the column satisfy its sdtype properties.""" + def _validate_column_data(self, column, sdtype_warnings): + """Validate values of the column satisfy its sdtype properties. + + Args: + column (pd.Series): + The data to validate against. + sdtype_warnings (defaultdict[list]): + A ``defaultdict`` with ``list`` to add warning messages to. + + """ column_metadata = self.columns[column.name] sdtype = column_metadata['sdtype'] invalid_values = None @@ -1060,13 +1069,18 @@ def _validate_column_data(self, column): lambda x: pd.isna(x) | _is_datetime_type(x) ) + if datetime_format is None and column.dtype == 'O': + sdtype_warnings['Column Name'].append(column.name) + sdtype_warnings['sdtype'].append(sdtype) + sdtype_warnings['datetime_format'].append(datetime_format) + if invalid_values: invalid_values = _format_invalid_values_string(invalid_values, 3) return [f"Invalid values found for {sdtype} column '{column.name}': {invalid_values}."] return [] - def validate_data(self, data): + def validate_data(self, data, sdtype_warnings=None): """Validate the data matches the metadata. Checks the metadata follows the following rules: @@ -1079,6 +1093,7 @@ def validate_data(self, data): data (pd.DataFrame): The data to validate. """ + sdtype_warnings = sdtype_warnings if sdtype_warnings is not None else defaultdict(list) if not isinstance(data, pd.DataFrame): raise ValueError(f'Data must be a DataFrame, not a {type(data)}.') @@ -1094,7 +1109,18 @@ def validate_data(self, data): # Every column must satisfy the properties of their sdtypes for column in data: - errors += self._validate_column_data(data[column]) + errors += self._validate_column_data(data[column], sdtype_warnings) + + if sdtype_warnings is not None and len(sdtype_warnings): + df = pd.DataFrame(sdtype_warnings) + message = ( + "No 'datetime_format' is present in the metadata for the following columns:\n" + f'{df.to_string(index=False)}\n' + 'Without this specification, SDV may not be able to accurately parse the data. ' + "We recommend adding datetime formats using 'update_column'." + ) + + warnings.warn(message) if errors: raise InvalidDataError(errors)