Skip to content

Commit

Permalink
Add warning for datetimes when they are missing datetime_format a…
Browse files Browse the repository at this point in the history
…nd are represented as `dtype` object (#1897)
  • Loading branch information
pvk-developer authored Apr 10, 2024
1 parent df7da68 commit ed52974
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 5 deletions.
33 changes: 32 additions & 1 deletion sdv/metadata/multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
create_columns_node, create_summarized_columns_node, visualize_graph)

LOGGER = logging.getLogger(__name__)
WARNINGS_COLUMN_ORDER = ['Table Name', 'Column Name', 'sdtype', 'datetime_format']


class MultiTableMetadata:
Expand Down Expand Up @@ -694,6 +695,7 @@ def _validate_single_table(self, errors):
errors.append(error_message)
try:
table.validate()

except Exception as error:
errors.append('\n')
title = f'Table: {table_name}'
Expand Down Expand Up @@ -753,9 +755,12 @@ def _validate_missing_tables(self, data):
def _validate_all_tables(self, data):
"""Validate every table of the data has a valid table/metadata pair."""
errors = []
warning_dataframes = []
for table_name, table_data in data.items():
table_sdtype_warnings = defaultdict(list)
try:
self.tables[table_name].validate_data(table_data)
with warnings.catch_warnings(record=True):
self.tables[table_name].validate_data(table_data, table_sdtype_warnings)

except InvalidDataError as error:
error_msg = f"Table: '{table_name}'"
Expand All @@ -770,6 +775,24 @@ def _validate_all_tables(self, data):
except KeyError:
continue

finally:
if table_sdtype_warnings:
table_sdtype_warnings['Table Name'].extend(
[table_name] * len(table_sdtype_warnings['Column Name'])
)
df = pd.DataFrame(table_sdtype_warnings, columns=WARNINGS_COLUMN_ORDER)
warning_dataframes.append(df)

if warning_dataframes:
warning_df = pd.concat(warning_dataframes)
warning_msg = (
"No 'datetime_format' is present in the metadata for the following columns:\n "
f'{warning_df.to_string(index=False)}\n'
'Without this specification, SDV may not be able to accurately parse the data. '
"We recommend adding datetime formats using 'update_column'."
)
warnings.warn(warning_msg)

return errors

def _validate_foreign_keys(self, data):
Expand Down Expand Up @@ -816,6 +839,14 @@ def validate_data(self, data):
Args:
data (pd.DataFrame):
The data to validate.
Raises:
InvalidDataError:
This error is being raised if the data is not matching its sdtype requirements.
Warns:
A warning is being raised if ``datetime_format`` is missing from a column represented
as ``object`` in the dataframe and its sdtype is ``datetime``.
"""
errors = []
errors += self._validate_missing_tables(data)
Expand Down
53 changes: 49 additions & 4 deletions sdv/metadata/single_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import re
import warnings
from collections import defaultdict
from copy import deepcopy
from datetime import datetime

Expand Down Expand Up @@ -1034,8 +1035,24 @@ def _get_invalid_column_values(column, validation_function):

return set(column[~valid])

def _validate_column_data(self, column):
"""Validate values of the column satisfy its sdtype properties."""
def _validate_column_data(self, column, sdtype_warnings):
"""Validate the values of the given column against its specified sdtype properties.
The function checks the sdtype of the column and validates the data accordingly. If there
are any errors, those are being appended to a list of errors that will be returned.
Additionally ``sdtype_warnings`` is being updated with ``datetime_format`` warnings
to be raised later.
Args:
column (pd.Series):
The data to validate against.
sdtype_warnings (defaultdict[list]):
A ``defaultdict`` with ``list`` to add warning messages.
Returns:
list:
A list containing any validation error messages found during the process.
"""
column_metadata = self.columns[column.name]
sdtype = column_metadata['sdtype']
invalid_values = None
Expand Down Expand Up @@ -1064,25 +1081,42 @@ def _validate_column_data(self, column):
lambda x: pd.isna(x) | _is_datetime_type(x)
)

if datetime_format is None and column.dtype == 'O':
sdtype_warnings['Column Name'].append(column.name)
sdtype_warnings['sdtype'].append(sdtype)
sdtype_warnings['datetime_format'].append(datetime_format)

if invalid_values:
invalid_values = _format_invalid_values_string(invalid_values, 3)
return [f"Invalid values found for {sdtype} column '{column.name}': {invalid_values}."]

return []

def validate_data(self, data):
def validate_data(self, data, sdtype_warnings=None):
"""Validate the data matches the metadata.
Checks the metadata follows the following rules:
* data columns match the metadata
* keys don't have missing values
* primary or alternate keys are unique
* values of a column satisfy their sdtype
* datetimes represented as objects have ``datetime_format`` (warning only).
Args:
data (pd.DataFrame):
The data to validate.
sdtype_warnings (defaultdict[list] or None):
A ``defaultdict`` with ``list`` to add warning messages.
Raises:
InvalidDataError:
This error is being raised if the data is not matching its sdtype requirements.
Warns:
A warning is being raised if ``datetime_format`` is missing from a column represented
as ``object`` in the dataframe and its sdtype is ``datetime``.
"""
sdtype_warnings = sdtype_warnings if sdtype_warnings is not None else defaultdict(list)
if not isinstance(data, pd.DataFrame):
raise ValueError(f'Data must be a DataFrame, not a {type(data)}.')

Expand All @@ -1098,7 +1132,18 @@ def validate_data(self, data):

# Every column must satisfy the properties of their sdtypes
for column in data:
errors += self._validate_column_data(data[column])
errors += self._validate_column_data(data[column], sdtype_warnings)

if sdtype_warnings is not None and len(sdtype_warnings):
df = pd.DataFrame(sdtype_warnings)
message = (
"No 'datetime_format' is present in the metadata for the following columns:\n"
f'{df.to_string(index=False)}\n'
'Without this specification, SDV may not be able to accurately parse the data. '
"We recommend adding datetime formats using 'update_column'."
)

warnings.warn(message)

if errors:
raise InvalidDataError(errors)
Expand Down
53 changes: 53 additions & 0 deletions tests/unit/metadata/test_multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1408,6 +1408,59 @@ def test_validate_data_missing_foreign_keys(self):
with pytest.raises(InvalidDataError, match=error_msg):
metadata.validate_data(data)

def test_validate_data_datetime_warning(self):
"""Test validation for columns with datetime.
If the datetime format is not provided, a warning should be shwon if the ``dtype`` is
object.
"""
# Setup
metadata = get_multi_table_metadata()
data = get_multi_table_data()

data['upravna_enota']['warning_date_str'] = [
'2022-09-02',
'2022-09-16',
'2022-08-26',
'2022-08-26'
]
data['upravna_enota']['valid_date'] = [
'20220902110443000000',
'20220916230356000000',
'20220826173917000000',
'20220929111311000000'
]
data['upravna_enota']['datetime'] = pd.to_datetime([
'20220902',
'20220916',
'20220826',
'20220826'
])
metadata.add_column('upravna_enota', 'warning_date_str', sdtype='datetime')
metadata.add_column(
'upravna_enota',
'valid_date',
sdtype='datetime',
datetime_format='%Y%m%d%H%M%S%f'
)
metadata.add_column('upravna_enota', 'datetime', sdtype='datetime')

# Run and Assert
warning_df = pd.DataFrame({
'Table Name': ['upravna_enota'],
'Column Name': ['warning_date_str'],
'sdtype': ['datetime'],
'datetime_format': [None]
})
warning_msg = (
"No 'datetime_format' is present in the metadata for the following columns:\n "
f'{warning_df.to_string(index=False)}\n'
'Without this specification, SDV may not be able to accurately parse the data. '
"We recommend adding datetime formats using 'update_column'."
)
with pytest.warns(UserWarning, match=warning_msg):
metadata.validate_data(data)

@patch('sdv.metadata.multi_table.SingleTableMetadata')
def test_add_table(self, table_metadata_mock):
"""Test that the method adds the table name to ``instance.tables``."""
Expand Down
50 changes: 50 additions & 0 deletions tests/unit/metadata/test_single_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2538,6 +2538,56 @@ def test_validate_data_datetime_sdtype(self):
with pytest.raises(InvalidDataError, match=err_msg):
metadata.validate_data(data)

def test_validate_data_datetime_warning(self):
"""Test validation for columns with datetime.
If the datetime format is not provided, a warning should be shwon if the ``dtype`` is
object.
"""
# Setup
data = pd.DataFrame({
'warning_date_str': [
'2022-09-02',
'2022-09-16',
'2022-08-26',
'2022-08-26',
'2022-09-29'
],
'valid_date': [
'20220902110443000000',
'20220916230356000000',
'20220826173917000000',
'20220826212135000000',
'20220929111311000000'
],
'datetime': pd.to_datetime([
'20220902',
'20220916',
'20220826',
'20220826',
'20220929'
])
})
metadata = SingleTableMetadata()
metadata.add_column('warning_date_str', sdtype='datetime')
metadata.add_column('valid_date', sdtype='datetime', datetime_format='%Y%m%d%H%M%S%f')
metadata.add_column('datetime', sdtype='datetime')

# Run and Assert
warning_frame = pd.DataFrame({
'Column Name': ['warning_date_str'],
'sdtype': ['datetime'],
'datetime_format': [None]
})
warning_msg = (
"No 'datetime_format' is present in the metadata for the following columns:\n"
f'{warning_frame.to_string(index=False)}\n'
'Without this specification, SDV may not be able to accurately parse the data. '
"We recommend adding datetime formats using 'update_column'."
)
with pytest.warns(UserWarning, match=warning_msg):
metadata.validate_data(data)

def test_validate_data(self):
"""Test the method doesn't crash when the passed data is valid.
Expand Down

0 comments on commit ed52974

Please sign in to comment.