Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch drop_missing_values in in drop_unknown_references to support null foreign keys by default #2081

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions sdv/utils/poc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
from sdv.utils.utils import drop_unknown_references as utils_drop_unknown_references


def drop_unknown_references(data, metadata):
def drop_unknown_references(data, metadata, drop_missing_values=False, verbose=True):
"""Wrap the drop_unknown_references function from the utils module."""
warnings.warn(
"Please access the 'drop_unknown_references' function directly from the sdv.utils module"
'instead of sdv.utils.poc.',
FutureWarning,
)
return utils_drop_unknown_references(data, metadata)
return utils_drop_unknown_references(data, metadata, drop_missing_values, verbose)


def simplify_schema(data, metadata, verbose=True):
Expand Down
4 changes: 2 additions & 2 deletions sdv/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from sdv.multi_table.utils import _drop_rows


def drop_unknown_references(data, metadata, drop_missing_values=True, verbose=True):
def drop_unknown_references(data, metadata, drop_missing_values=False, verbose=True):
"""Drop rows with unknown foreign keys.

Args:
Expand All @@ -22,7 +22,7 @@ def drop_unknown_references(data, metadata, drop_missing_values=True, verbose=Tr
drop_missing_values (bool):
Boolean describing whether or not to also drop foreign keys with missing values
If True, drop rows with missing values in the foreign keys.
Defaults to True.
Defaults to False.
verbose (bool):
If True, print information about the rows that are dropped.
Defaults to True.
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def test_drop_unknown_references_drop_missing_values(metadata, data, capsys):
data['child'].loc[4, 'parent_id'] = np.nan

# Run
cleaned_data = drop_unknown_references(data, metadata)
cleaned_data = drop_unknown_references(data, metadata, drop_missing_values=True)
metadata.validate_data(cleaned_data)
captured = capsys.readouterr()

Expand Down
8 changes: 6 additions & 2 deletions tests/unit/utils/test_poc.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,21 @@ def test_drop_unknown_references(mock_drop_unknown_references):
# Setup
data = Mock()
metadata = Mock()
drop_missing_values = Mock()
verbose = Mock()
expected_message = re.escape(
"Please access the 'drop_unknown_references' function directly from the sdv.utils module"
'instead of sdv.utils.poc.'
)

# Run
with pytest.warns(FutureWarning, match=expected_message):
drop_unknown_references(data, metadata)
drop_unknown_references(data, metadata, drop_missing_values, verbose)

# Assert
mock_drop_unknown_references.assert_called_once_with(data, metadata)
mock_drop_unknown_references.assert_called_once_with(
data, metadata, drop_missing_values, verbose
)


@patch('sdv.utils.poc._get_total_estimated_columns')
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _drop_rows(data, metadata, drop_missing_values):
}
metadata.validate.assert_called_once()
metadata.validate_data.assert_called_once_with(data)
mock_drop_rows.assert_called_once_with(result, metadata, True)
mock_drop_rows.assert_called_once_with(result, metadata, False)
for table_name, table in result.items():
pd.testing.assert_frame_equal(table, expected_result[table_name])

Expand Down Expand Up @@ -189,7 +189,7 @@ def test_drop_unknown_references_with_nan(mock_validate_foreign_keys, mock_get_r
mock_get_rows_to_drop.return_value = defaultdict(set, {'child': {4}, 'grandchild': {0, 3, 4}})

# Run
result = drop_unknown_references(data, metadata, verbose=False)
result = drop_unknown_references(data, metadata, drop_missing_values=True, verbose=False)

# Assert
metadata.validate.assert_called_once()
Expand Down
Loading