From fe1f3927c9c298c2204f4fe38420aab0689b0de4 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Thu, 19 Sep 2024 09:57:03 -0700 Subject: [PATCH] [SNOW-1429199]: Provide clearer error message for groupby aggregations (#2308) 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-1429199 2. Fill out the following pre-review checklist: - [ ] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. 3. Please describe how your code solves the related issue. Ensure clearer error messages for groupby aggregations to help snowpark users. --------- Co-authored-by: Naren Krishna --- CHANGELOG.md | 1 + .../plugin/extensions/groupby_overrides.py | 17 ++++++++++++++--- .../modin/groupby/test_groupby_negative.py | 5 ++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 61b3c20b890..b62532bd333 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ - Improved `to_pandas` to persist the original timezone offset for TIMESTAMP_TZ type. - Improved `dtype` results for TIMESTAMP_TZ type to show correct timezone offset. +- Improved error message when passing non-bool value to `numeric_only` for groupby aggregations. #### New Features diff --git a/src/snowflake/snowpark/modin/plugin/extensions/groupby_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/groupby_overrides.py index f12217281c5..acd122bbe53 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/groupby_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/groupby_overrides.py @@ -1173,9 +1173,20 @@ def _wrap_aggregation( Returns the same type as `self._df`. """ # TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.DataFrameGroupBy functions - numeric_only = validate_bool_kwarg( - numeric_only, "numeric_only", none_allowed=True - ) + try: + numeric_only = validate_bool_kwarg( + numeric_only, "numeric_only", none_allowed=True + ) + except ValueError: + # SNOW-1429199: Snowpark users expect to be able to pass in the column to aggregate + # on in the aggregation method, e.g. df.groupby("COL0").sum("COL1"), but the pandas + # API's only accept the numeric_only argument, so users get an error complaining that + # the numeric_only kwarg expects a bool argument, but a string was passed in. Instead + # of that error, we can throw this error instead that will make it more clear to users + # what went wrong. + raise ValueError( + f"GroupBy aggregations like 'sum' take a 'numeric_only' argument that needs to be a bool, but a {type(numeric_only).__name__} value was passed in." + ) agg_args = tuple() if agg_args is None else agg_args agg_kwargs = dict() if agg_kwargs is None else agg_kwargs diff --git a/tests/integ/modin/groupby/test_groupby_negative.py b/tests/integ/modin/groupby/test_groupby_negative.py index 0c9c056c2a7..45560862a30 100644 --- a/tests/integ/modin/groupby/test_groupby_negative.py +++ b/tests/integ/modin/groupby/test_groupby_negative.py @@ -536,7 +536,10 @@ def test_groupby_agg_invalid_numeric_only( # treated as True. This behavior is confusing to customer, in Snowpark pandas, we do an # explicit type check, an errors it out if an invalid value is given. with pytest.raises( - ValueError, match=re.escape('For argument "numeric_only" expected type bool') + ValueError, + match=re.escape( + f"GroupBy aggregations like 'sum' take a 'numeric_only' argument that needs to be a bool, but a {type(numeric_only).__name__} value was passed in." + ), ): getattr(basic_snowpark_pandas_df.groupby("col1"), agg_method_name)( numeric_only=numeric_only