Skip to content

Commit

Permalink
SNOW-1429873 Fix pandas groupby apply int not serializable issue (#1595)
Browse files Browse the repository at this point in the history
<!---
Please answer these questions before creating your pull request. Thanks!
--->

1. Which Jira issue is this PR addressing? Make sure that there is an
accompanying issue to your PR.

   <!---
   In this section, please add a Snowflake Jira issue number.
   
Note that if a corresponding GitHub issue exists, you should still
include
   the Snowflake Jira issue number. For example, for GitHub issue
#1400, you should
   add "SNOW-1335071" here.
    --->

Fixes SNOW-1429873 Fix pandas groupby apply int not serializable issue

2. Fill out the following pre-review checklist:

   - [ ] I am adding a new automated test(s) to verify correctness of my
     new code
   - [ ] I am adding new logging messages
   - [ ] I am adding a new telemetry message
   - [ ] I am adding new credentials
   - [ ] I am adding a new dependency

3. Please describe how your code solves the related issue.

The values returned by `groupby.apply` are now passed through
`convert_numpy_int_result_to_int` before `handle_missing_value_in_variant`,
so NumPy integer results (e.g. `np.int64`, which is not JSON serializable)
are converted to native Python ints before serialization.
  • Loading branch information
sfc-gh-azhan authored May 16, 2024
1 parent 5eda6e5 commit 11e2a92
Show file tree
Hide file tree
Showing 6 changed files with 5,041 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/snowflake/snowpark/modin/plugin/PANDAS_CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- Fixed bug when performing multiple DataFrameGroupBy apply/transform operations on the same DataFrame.
- Fixed type hints for property methods, e.g. Series.empty.
- Fixed `pd.merge` and `Dataframe.merge` outer join behavior according to pandas 2.x.
- Fixed a bug where `groupby.apply` results containing NumPy integers were not JSON serializable.

### Behavior Changes
- Given an input of type `Series`, `pd.qcut` always returns a `Series`.
Expand Down
12 changes: 10 additions & 2 deletions src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,13 @@ def convert_groupby_apply_dataframe_result_to_standard_schema(
dtype=object,
)
result_df["value"] = (
result_df["value"].apply(handle_missing_value_in_variant).astype(object)
result_df["value"]
.apply(
lambda v: handle_missing_value_in_variant(
convert_numpy_int_result_to_int(v)
)
)
.astype(object)
)
result_df["first_position_for_group"] = input_row_positions.iloc[0]
return result_df
Expand Down Expand Up @@ -689,7 +695,9 @@ def apply_func(x): # type: ignore[no-untyped-def] # pragma: no cover
# Calling tolist() convert np.int*, np.bool*, etc. (which is not
# json-serializable) to python native values
for e in x.apply(func, args=args, **kwargs).tolist():
result.append(handle_missing_value_in_variant(e))
result.append(
handle_missing_value_in_variant(convert_numpy_int_result_to_int(e))
)
return result

else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2823,6 +2823,12 @@ def groupby_apply(
+ f"level={level}, and axis={axis}"
)

if "include_groups" in agg_kwargs:
# exclude "include_groups" from the apply function kwargs
agg_kwargs.pop("include_groups")

# TODO: SNOW-1429855 support include_groups = False

sort = groupby_kwargs.get("sort", True)
as_index = groupby_kwargs.get("as_index", True)
dropna = groupby_kwargs.get("dropna", True)
Expand Down
23 changes: 23 additions & 0 deletions tests/integ/modin/groupby/test_groupby_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#

import datetime
import pathlib
import sys

import cloudpickle
Expand Down Expand Up @@ -857,6 +858,28 @@ def operation(df):
operation,
)

# TODO: SNOW-1429855 test support include_groups = False
@sql_count_checker(
    query_count=8,
    udtf_count=UDTF_COUNT,
    join_count=JOIN_COUNT,
)
def test_group_apply_return_df_from_lambda(self):
    """Applying a DataFrame-returning lambda per group matches native pandas."""
    # The diamonds fixture lives four directories up, under resources/.
    csv_path = pathlib.Path(__file__).parents[3] / "resources" / "diamonds.csv"
    native_df = native_pd.read_csv(csv_path)

    def top_five_by_price(group):
        # Highest-priced rows first; keep the top 5 per group.
        return group.sort_values("price", ascending=False).head(5)

    eval_snowpark_pandas_result(
        pd.DataFrame(native_df),
        native_df,
        lambda diamonds: diamonds.groupby("cut").apply(
            top_five_by_price,
            include_groups=True,
        ),
    )


class TestFuncReturnsSeries:
@pytest.mark.parametrize(
Expand Down
Loading

0 comments on commit 11e2a92

Please sign in to comment.