Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure HMA diagnostic report is 1.0 for integer primary/foreign key with very large values #2314

Merged
merged 6 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions sdv/data_processing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ def __init__(
self.fitted = False
self.formatters = {}
self._primary_key = self.metadata.primary_key
self._warned_overflow = False
self._prepared_for_fitting = False
self._keys = deepcopy(self.metadata.alternate_keys)
if self._primary_key:
Expand Down Expand Up @@ -934,6 +935,15 @@ def reverse_transform(self, data, reset_keys=False):
self.formatters.pop(column_name)
else:
raise ValueError(e)
except OverflowError:
if not self._warned_overflow:
warnings.warn(
f"The real data in '{self.table_name}' and column '{column_name}' was "
f"stored as '{dtype}' but the synthetic data overflowed when casting back "
'to this type. If this is a problem, please check your input data '
'and metadata settings.'
)
self._warned_overflow = True

# reformat columns using the formatters
for column in sampled_columns:
Expand Down
6 changes: 4 additions & 2 deletions sdv/sampling/hierarchical_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def _sample_children(self, table_name, sampled_data, scale=1.0):
self._enforce_table_size(child_name, table_name, scale, sampled_data)

if child_name not in sampled_data: # Sample based on only 1 parent
- for _, row in sampled_data[table_name].iterrows():
+ for _, row in sampled_data[table_name].astype(object).iterrows():
self._add_child_rows(
child_name=child_name,
parent_name=table_name,
Expand All @@ -219,7 +219,9 @@ def _sample_children(self, table_name, sampled_data, scale=1.0):

if child_name not in sampled_data: # No child rows sampled, force row creation
num_rows_key = f'__{child_name}__{foreign_key}__num_rows'
- max_num_child_index = sampled_data[table_name][num_rows_key].idxmax()
+ max_num_child_index = pd.to_numeric(
+ sampled_data[table_name][num_rows_key], errors='coerce'
+ ).idxmax()
parent_row = sampled_data[table_name].iloc[max_num_child_index]

self._add_child_rows(
Expand Down
7 changes: 7 additions & 0 deletions sdv/sampling/independent_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ def _finalize(self, sampled_data):

else:
raise ValueError(e)
except OverflowError:
LOGGER.debug(
f"The real data in '{table_name}' and column '{name}' was stored as "
f"'{dtype}' but the synthetic data overflowed when casting back to "
'this type. If this is a problem, please check your input data '
'and metadata settings.'
)

final_data[table_name] = table_rows[list(dtypes.keys())]

Expand Down
Loading
Loading