Skip to content

Commit

Permalink
SNOW-1429873 Fix pandas groupby apply int not serializable issue (#1595)
Browse files Browse the repository at this point in the history
<!---
Please answer these questions before creating your pull request. Thanks!
--->

1. Which Jira issue is this PR addressing? Make sure that there is an
accompanying issue to your PR.

   <!---
   In this section, please add a Snowflake Jira issue number.
   
Note that if a corresponding GitHub issue exists, you should still
include
   the Snowflake Jira issue number. For example, for GitHub issue
#1400, you should
   add "SNOW-1335071" here.
    --->

Fixes SNOW-1429873 Fix pandas groupby apply int not serializable issue

2. Fill out the following pre-review checklist:

   - [ ] I am adding a new automated test(s) to verify correctness of my
     new code
   - [ ] I am adding new logging messages
   - [ ] I am adding a new telemetry message
   - [ ] I am adding new credentials
   - [ ] I am adding a new dependency

3. Please describe how your code solves the related issue.

The values returned by `groupby.apply` are now passed through
`convert_numpy_int_result_to_int` before `handle_missing_value_in_variant`,
so NumPy integer results (e.g. `np.int64`, which is not JSON serializable)
are converted to native Python ints before serialization.
  • Loading branch information
sfc-gh-azhan authored May 16, 2024
1 parent 5eda6e5 commit 11e2a92
Show file tree
Hide file tree
Showing 6 changed files with 5,041 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- Fixed bug when performing multiple DataFrameGroupBy apply/transform operations on the same DataFrame.
- Fixed type hints for property methods, e.g. Series.empty.
- Fixed `pd.merge` and `Dataframe.merge` outer join behavior according to pandas 2.x.
- Fixed a bug where `groupby.apply` results containing NumPy integers were not JSON serializable.

### Behavior Changes
- Given an input of type `Series`, `pd.qcut` always returns a `Series`.
Expand Down
12 changes: 10 additions & 2 deletions src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,13 @@ def convert_groupby_apply_dataframe_result_to_standard_schema(
dtype=object,
)
result_df["value"] = (
result_df["value"].apply(handle_missing_value_in_variant).astype(object)
result_df["value"]
.apply(
lambda v: handle_missing_value_in_variant(
convert_numpy_int_result_to_int(v)
)
)
.astype(object)
)
result_df["first_position_for_group"] = input_row_positions.iloc[0]
return result_df
Expand Down Expand Up @@ -689,7 +695,9 @@ def apply_func(x): # type: ignore[no-untyped-def] # pragma: no cover
# Calling tolist() convert np.int*, np.bool*, etc. (which is not
# json-serializable) to python native values
for e in x.apply(func, args=args, **kwargs).tolist():
result.append(handle_missing_value_in_variant(e))
result.append(
handle_missing_value_in_variant(convert_numpy_int_result_to_int(e))
)
return result

else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2823,6 +2823,12 @@ def groupby_apply(
+ f"level={level}, and axis={axis}"
)

if "include_groups" in agg_kwargs:
# exclude "include_groups" from the apply function kwargs
agg_kwargs.pop("include_groups")

# TODO: SNOW-1429855 support include_groups = False

sort = groupby_kwargs.get("sort", True)
as_index = groupby_kwargs.get("as_index", True)
dropna = groupby_kwargs.get("dropna", True)
Expand Down
23 changes: 23 additions & 0 deletions tests/integ/modin/groupby/test_groupby_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#

import datetime
import pathlib
import sys

import cloudpickle
Expand Down Expand Up @@ -857,6 +858,28 @@ def operation(df):
operation,
)

# TODO: SNOW-1429855 test support include_groups = False
@sql_count_checker(
    query_count=8,
    udtf_count=UDTF_COUNT,
    join_count=JOIN_COUNT,
)
def test_group_apply_return_df_from_lambda(self):
    """Applying a DataFrame-returning lambda per group matches native pandas."""
    # The diamonds fixture lives four directories up, under resources/.
    csv_path = pathlib.Path(__file__).parents[3] / "resources" / "diamonds.csv"
    native_df = native_pd.read_csv(csv_path)

    def top_five_by_price(group):
        # Highest-priced rows first; keep the top 5 per group.
        return group.sort_values("price", ascending=False).head(5)

    eval_snowpark_pandas_result(
        pd.DataFrame(native_df),
        native_df,
        lambda diamonds: diamonds.groupby("cut").apply(
            top_five_by_price,
            include_groups=True,
        ),
    )


class TestFuncReturnsSeries:
@pytest.mark.parametrize(
Expand Down
Loading

0 comments on commit 11e2a92

Please sign in to comment.