Skip to content

Commit

Permalink
Add warnings for datetimes
Browse files: browse the repository at this point in the history
  • Loading branch information
pvk-developer committed Apr 5, 2024
1 parent 70d6c8b commit c9b1ff2
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 5 deletions.
24 changes: 23 additions & 1 deletion sdv/metadata/multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
create_columns_node, create_summarized_columns_node, visualize_graph)

LOGGER = logging.getLogger(__name__)
WARNINGS_COLUMN_ORDER = ['Table Name', 'Column Name', 'sdtype', 'datetime_format']


class MultiTableMetadata:
Expand Down Expand Up @@ -694,6 +695,7 @@ def _validate_single_table(self, errors):
errors.append(error_message)
try:
table.validate()

except Exception as error:
errors.append('\n')
title = f'Table: {table_name}'
Expand Down Expand Up @@ -753,9 +755,12 @@ def _validate_missing_tables(self, data):
def _validate_all_tables(self, data):
"""Validate every table of the data has a valid table/metadata pair."""
errors = []
warning_dataframes = []
for table_name, table_data in data.items():
table_sdtype_warnings = defaultdict(list)
try:
self.tables[table_name].validate_data(table_data)
with warnings.catch_warnings(record=True):
self.tables[table_name].validate_data(table_data, table_sdtype_warnings)

except InvalidDataError as error:
error_msg = f"Table: '{table_name}'"
Expand All @@ -770,6 +775,23 @@ def _validate_all_tables(self, data):
except KeyError:
continue

finally:
if table_sdtype_warnings:
table_sdtype_warnings['Table Name'].extend(
[table_name] * len(table_sdtype_warnings['Column Name'])
)
df = pd.DataFrame(table_sdtype_warnings, columns=WARNINGS_COLUMN_ORDER)
warning_dataframes.append(df)

if warning_dataframes:
df = pd.concat(warning_dataframes)
warnings.warn(
"No 'datetime_format' is present in the metadata for the following columns:\n "
f'{df.to_string(index=False)}\n\n'
'Without this specification, SDV may not be able to accurately parse the data. '
"We recommend adding datetime formats using 'update_column'."
)

return errors

def _validate_foreign_keys(self, data):
Expand Down
34 changes: 30 additions & 4 deletions sdv/metadata/single_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import re
import warnings
from collections import defaultdict
from copy import deepcopy
from datetime import datetime

Expand Down Expand Up @@ -1030,8 +1031,16 @@ def _get_invalid_column_values(column, validation_function):

return set(column[~valid])

def _validate_column_data(self, column):
"""Validate values of the column satisfy its sdtype properties."""
def _validate_column_data(self, column, sdtype_warnings):
"""Validate values of the column satisfy its sdtype properties.
Args:
column (pd.Series):
The data to validate against.
sdtype_warnings (defaultdict[list]):
A ``defaultdict`` with ``list`` to add warning messages to.
"""
column_metadata = self.columns[column.name]
sdtype = column_metadata['sdtype']
invalid_values = None
Expand Down Expand Up @@ -1060,13 +1069,18 @@ def _validate_column_data(self, column):
lambda x: pd.isna(x) | _is_datetime_type(x)
)

if datetime_format is None and column.dtype == 'O':
sdtype_warnings['Column Name'].append(column.name)
sdtype_warnings['sdtype'].append(sdtype)
sdtype_warnings['datetime_format'].append(datetime_format)

if invalid_values:
invalid_values = _format_invalid_values_string(invalid_values, 3)
return [f"Invalid values found for {sdtype} column '{column.name}': {invalid_values}."]

return []

def validate_data(self, data):
def validate_data(self, data, sdtype_warnings=None):
"""Validate the data matches the metadata.
Checks the metadata follows the following rules:
Expand All @@ -1079,6 +1093,7 @@ def validate_data(self, data):
data (pd.DataFrame):
The data to validate.
"""
sdtype_warnings = sdtype_warnings if sdtype_warnings is not None else defaultdict(list)
if not isinstance(data, pd.DataFrame):
raise ValueError(f'Data must be a DataFrame, not a {type(data)}.')

Expand All @@ -1094,7 +1109,18 @@ def validate_data(self, data):

# Every column must satisfy the properties of their sdtypes
for column in data:
errors += self._validate_column_data(data[column])
errors += self._validate_column_data(data[column], sdtype_warnings)

if sdtype_warnings is not None and len(sdtype_warnings):
df = pd.DataFrame(sdtype_warnings)
message = (
"No 'datetime_format' is present in the metadata for the following columns:\n"
f'{df.to_string(index=False)}\n'
'Without this specification, SDV may not be able to accurately parse the data. '
"We recommend adding datetime formats using 'update_column'."
)

warnings.warn(message)

if errors:
raise InvalidDataError(errors)
Expand Down

0 comments on commit c9b1ff2

Please sign in to comment.