sdv-dev · fealho · Dec 16, 2024 · Dec 4, 2024 · Dec 11, 2024 · Dec 13, 2024
@@ -138,6 +138,7 @@ def __init__(
         self.fitted = False
         self.formatters = {}
         self._primary_key = self.metadata.primary_key
+        self._warned_overflow = False
         self._prepared_for_fitting = False
         self._keys = deepcopy(self.metadata.alternate_keys)
         if self._primary_key:
@@ -934,6 +935,15 @@ def reverse_transform(self, data, reset_keys=False):
                         self.formatters.pop(column_name)
                 else:
                     raise ValueError(e)
+            except OverflowError:
+                if not self._warned_overflow:
+                    warnings.warn(
+                        f"The real data in '{self.table_name}' and column '{column_name}' was "
+                        f"stored as '{dtype}' but the synthetic data overflowed when casting back "
+                        'to this type. If this is a problem, please check your input data '
+                        'and metadata settings.'
+                    )
+                self._warned_overflow = True
 
         # reformat columns using the formatters
         for column in sampled_columns:

@@ -207,7 +207,7 @@ def _sample_children(self, table_name, sampled_data, scale=1.0):
             self._enforce_table_size(child_name, table_name, scale, sampled_data)
 
             if child_name not in sampled_data:  # Sample based on only 1 parent
-                for _, row in sampled_data[table_name].iterrows():
+                for _, row in sampled_data[table_name].astype(object).iterrows():
                     self._add_child_rows(
                         child_name=child_name,
                         parent_name=table_name,
@@ -219,7 +219,9 @@ def _sample_children(self, table_name, sampled_data, scale=1.0):
 
                 if child_name not in sampled_data:  # No child rows sampled, force row creation
                     num_rows_key = f'__{child_name}__{foreign_key}__num_rows'
-                    max_num_child_index = sampled_data[table_name][num_rows_key].idxmax()
+                    max_num_child_index = pd.to_numeric(
+                        sampled_data[table_name][num_rows_key], errors='coerce'
+                    ).idxmax()
                     parent_row = sampled_data[table_name].iloc[max_num_child_index]
 
                     self._add_child_rows(

@@ -118,6 +118,13 @@ def _finalize(self, sampled_data):
 
                     else:
                         raise ValueError(e)
+                except OverflowError:
+                    LOGGER.debug(
+                        f"The real data in '{table_name}' and column '{name}' was stored as "
+                        f"'{dtype}' but the synthetic data overflowed when casting back to "
+                        'this type. If this is a problem, please check your input data '
+                        'and metadata settings.'
+                    )
 
             final_data[table_name] = table_rows[list(dtypes.keys())]