SNOW-1445832: Added support for DataFrame.stack (#1821)

Signed-off-by: Naren Krishna <[email protected]> Co-authored-by: Devin Petersohn <[email protected]>
snowflakedb · Jun 26, 2024 · c96417b · c96417b
1 parent 197afbf
commit c96417b
Show file tree

Hide file tree

Showing 8 changed files with 208 additions and 15 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -71,6 +71,7 @@
 - Added support for `Series.case_when` except when condition or replacement is callable.
 - Added documentation pages for `Index` and its APIs.
 - Added support for `DataFrame.assign`.
+- Added support for `DataFrame.stack`.
 
 #### Bug Fixes
 

diff --git a/docs/source/modin/dataframe.rst b/docs/source/modin/dataframe.rst
@@ -183,13 +183,14 @@ DataFrame
 .. autosummary::
     :toctree: pandas_api/
 
-    DataFrame.pivot_table
-    DataFrame.sort_values
-    DataFrame.sort_index
+    DataFrame.melt
     DataFrame.nlargest
     DataFrame.nsmallest
-    DataFrame.melt
+    DataFrame.pivot_table
+    DataFrame.sort_index
+    DataFrame.sort_values
     DataFrame.squeeze
+    DataFrame.stack
     DataFrame.T
     DataFrame.transpose
 

diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst
@@ -389,7 +389,8 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``squeeze``                 | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``stack``                   | N                               |                                  |                                                    |
+| ``stack``                   | P                               | ``level``,                       | ``N`` for MultiIndex                               |
+|                             |                                 | ``future_stack`` is ignored      |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``std``                     | P                               |                                  | ``N`` if ``ddof`` is not 0 or 1                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py
@@ -2354,23 +2354,39 @@ def squeeze(self, axis: Axis | None = None):
                 return Series(query_compiler=self.T._query_compiler)
         return self.copy()
 
-    @dataframe_not_implemented()
-    def stack(self, level=-1, dropna=True):  # noqa: PR01, RT01, D200
+    def stack(
+        self,
+        level: int | str | list = -1,
+        dropna: bool | NoDefault = no_default,
+        sort: bool | NoDefault = no_default,
+        future_stack: bool = False,  # ignored
+    ):
         """
         Stack the prescribed level(s) from columns to index.
         """
         # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions
-        if not isinstance(self.columns, pandas.MultiIndex) or (
-            isinstance(self.columns, pandas.MultiIndex)
-            and is_list_like(level)
-            and len(level) == self.columns.nlevels
+        if future_stack is not False:
+            WarningMessage.ignored_argument(  # pragma: no cover
+                operation="DataFrame.stack",
+                argument="future_stack",
+                message="future_stack parameter has been ignored with Snowflake execution engine",
+            )
+        if dropna is NoDefault:
+            dropna = True  # pragma: no cover
+        if sort is NoDefault:
+            sort = True  # pragma: no cover
+
+        # This ensures that non-pandas MultiIndex objects are caught.
+        is_multiindex = len(self.columns.names) > 1
+        if not is_multiindex or (
+            is_multiindex and is_list_like(level) and len(level) == self.columns.nlevels
         ):
             return self._reduce_dimension(
-                query_compiler=self._query_compiler.stack(level, dropna)
+                query_compiler=self._query_compiler.stack(level, dropna, sort)
             )
         else:
             return self.__constructor__(
-                query_compiler=self._query_compiler.stack(level, dropna)
+                query_compiler=self._query_compiler.stack(level, dropna, sort)
             )
 
     def sub(

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -2503,7 +2503,7 @@ def sort_rows_by_column_values(
             na_position: Puts NaNs at the beginning if 'first'; 'last' puts NaNs at the end. Defaults to 'last'
             ignore_index: If True, existing index is ignored and new index is generated which is a gap free
                 sequence from 0 to n-1. Defaults to False.
-            key: Apply the key function to the values before sorting. Fallback to native pandas if key is provided.
+            key: Apply the key function to the values before sorting.
 
         Returns:
             A new SnowflakeQueryCompiler instance after applying the sort.
@@ -14815,3 +14815,71 @@ def pct_change(
                     }
                 ).frame
             )
+
+    def stack(
+        self,
+        level: Union[int, str, list] = -1,
+        dropna: bool = True,
+        sort: bool = True,
+    ) -> "SnowflakeQueryCompiler":
+        """
+        Stack the prescribed level(s) from columns to index.
+
+        Return a reshaped DataFrame or Series having a multi-level index with one
+        or more new inner-most levels compared to the current DataFrame. The new inner-most
+        levels are created by pivoting the columns of the current dataframe:
+            - if the columns have a single level, the output is a Series.
+            - if the columns have multiple levels, the new index level(s) is (are)
+              taken from the prescribed level(s) and the output is a DataFrame.
+
+        Parameters
+        ----------
+        level : int, str, list, default -1
+            Level(s) to stack from the column axis onto the index axis,
+            defined as one index or label, or a list of indices or labels.
+            Only the default value -1 is accepted by this implementation.
+
+        dropna : bool, default True
+            Whether to drop rows in the resulting Frame/Series with missing values. Stacking a
+            column level onto the index axis can create combinations of index and column values
+            that are missing from the original dataframe.
+
+        sort : bool, default True
+            Whether to sort the levels of the resulting MultiIndex.
+            This argument is not consulted in the body below; the rows are
+            always sorted by the index column.
+
+        Returns
+        -------
+        SnowflakeQueryCompiler
+            Query compiler holding the stacked values under a two-level, unnamed index.
+
+        Raises
+        ------
+        NotImplementedError
+            If ``level`` is not -1, or if the frame has MultiIndex columns.
+        """
+        # Reject the unsupported cases up front, before any query is built.
+        if level != -1:
+            ErrorMessage.not_implemented(
+                "Snowpark pandas doesn't yet support 'level != -1' in stack API",
+            )
+        if self._modin_frame.is_multiindex(axis=1):
+            ErrorMessage.not_implemented(
+                "Snowpark pandas doesn't support multiindex columns in stack API"
+            )
+
+        # NOTE(review): assumes reset_index() exposes the index as a single data
+        # column labeled "index" (i.e. an unnamed, single-level index and no
+        # existing "index" column) — TODO confirm behavior for named/multi-level indexes.
+        index_names = ["index"]
+        # Stack is equivalent to doing df.melt() with index reset, sorting the values, then setting the index
+        # Note that we always use sort_rows_by_column_values even if sort is False
+        # NOTE(review): the stable sort on the index column groups each original
+        # row's entries together, but presumably also reorders rows whose index
+        # labels were not already ascending — verify against native pandas.
+        qc = (
+            self.reset_index()
+            .melt(
+                id_vars=index_names,
+                value_vars=self.columns,
+                var_name="index_second_level",
+                value_name=MODIN_UNNAMED_SERIES_LABEL,
+                ignore_index=False,
+            )
+            .sort_rows_by_column_values(
+                columns=index_names,  # type: ignore
+                ascending=[True],
+                kind="stable",
+                na_position="last",
+                ignore_index=False,
+            )
+            # Presumably melt substitutes UNPIVOT_NULL_REPLACE_VALUE for nulls;
+            # map that placeholder back to NaN — confirm against melt.
+            .replace(to_replace=UNPIVOT_NULL_REPLACE_VALUE, value=np.nan)
+            # Original index becomes the outer level, former column labels the inner level.
+            .set_index_from_columns(index_names + ["index_second_level"])  # type: ignore
+            # Clear the temporary labels so both index levels are unnamed.
+            .set_index_names([None, None])
+        )
+
+        # Missing values are only removed at the very end, on the stacked result.
+        if dropna:
+            return qc.dropna(axis=0, how="any", thresh=None)
+        else:
+            return qc
diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
@@ -3227,6 +3227,59 @@ def squeeze():
     def stack():
         """
         Stack the prescribed level(s) from columns to index.
+
+        Return a reshaped DataFrame or Series having a multi-level index with one
+        or more new inner-most levels compared to the current DataFrame. The new inner-most
+        levels are created by pivoting the columns of the current dataframe.
+        If the columns have a single level, the output is a Series.
+        If the columns have multiple levels, the new index level(s) is (are)
+        taken from the prescribed level(s) and the output is a DataFrame.
+
+        Parameters
+        ----------
+        level : int, str, list, default -1
+            Level(s) to stack from the column axis onto the index axis,
+            defined as one index or label, or a list of indices or labels.
+
+        dropna : bool, default True
+            Whether to drop rows in the resulting Frame/Series with missing values. Stacking a
+            column level onto the index axis can create combinations of index and column values
+            that are missing from the original dataframe.
+
+        sort : bool, default True
+            Whether to sort the levels of the resulting MultiIndex.
+
+        future_stack : bool, default False
+            This argument is ignored in Snowpark pandas.
+
+        Returns
+        -------
+        DataFrame or Series
+            Stacked dataframe or series.
+
+        Notes
+        -----
+        ``level != -1`` and MultiIndex dataframes are not yet supported by Snowpark pandas;
+        both raise ``NotImplementedError``.
+
+        The ``sort`` argument currently has no effect: the rows of the result are always
+        ordered by the index values.
+
+        See Also
+        --------
+        DataFrame.unstack : Unstack prescribed level(s) from index axis onto column axis.
+        DataFrame.pivot : Reshape dataframe from long format to wide format.
+        DataFrame.pivot_table : Create a spreadsheet-style pivot table as a DataFrame.
+
+        Examples
+        --------
+        >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], index=['cat', 'dog'], columns=['weight', 'height'])
+        >>> df_single_level_cols
+             weight  height
+        cat       0       1
+        dog       2       3
+        >>> df_single_level_cols.stack()
+        cat  weight    0
+             height    1
+        dog  weight    2
+             height    3
+        dtype: int64
         """
 
     def sub():

diff --git a/tests/integ/modin/frame/test_stack.py b/tests/integ/modin/frame/test_stack.py
@@ -0,0 +1,54 @@
+#
+# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
+#
+
+import modin.pandas as pd
+import numpy as np
+import pytest
+
+from tests.integ.modin.sql_counter import sql_count_checker
+from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result
+
+
+@pytest.mark.parametrize(
+    "data, index, columns",
+    [
+        ([[0, 1], [2, 3]], ["cat", "dog"], ["weight", "height"]),
+        ([[0, np.nan], [np.nan, 3]], ["cat", "dog"], ["weight", "height"]),
+    ],
+)
+@pytest.mark.parametrize("dropna", [True, False])
+@pytest.mark.parametrize("sort", [True, False])
+@sql_count_checker(query_count=1)
+def test_stack(data, index, columns, dropna, sort):
+    """Exercise ``DataFrame.stack`` on single-level columns for every dropna/sort combination.
+
+    The two data sets cover a frame without missing values and one containing
+    NaN. ``sql_count_checker`` is given an expected query count of 1.
+    """
+    # NOTE(review): presumably create_test_dfs builds matching Snowpark pandas
+    # and native pandas frames, and eval_snowpark_pandas_result compares the
+    # lambda's result on both — confirm in tests.integ.modin.utils.
+    eval_snowpark_pandas_result(
+        *create_test_dfs(data=data, index=index, columns=columns),
+        lambda df: df.stack(dropna=dropna, sort=sort),
+    )
+
+
+@sql_count_checker(query_count=0)
+def test_stack_level_unsupported():
+    """Check that a ``level`` other than -1 raises ``NotImplementedError``.
+
+    ``sql_count_checker`` is given an expected query count of 0.
+    """
+    df_single_level_cols = pd.DataFrame(
+        [[0, 1], [2, 3]], index=["cat", "dog"], columns=["weight", "height"]
+    )
+
+    # level=0 differs from the only supported value (-1).
+    with pytest.raises(
+        NotImplementedError,
+        match="Snowpark pandas doesn't yet support 'level != -1' in stack API",
+    ):
+        df_single_level_cols.stack(level=0)
+
+
+@sql_count_checker(query_count=0)
+def test_stack_multiindex_unsupported():
+    """Check that stacking a frame with MultiIndex columns raises ``NotImplementedError``.
+
+    ``sql_count_checker`` is given an expected query count of 0.
+    """
+    # Two-level column labels: ("weight", "kg") and ("weight", "pounds").
+    multicol1 = pd.MultiIndex.from_tuples([("weight", "kg"), ("weight", "pounds")])
+    df_multi_level_cols1 = pd.DataFrame(
+        [[1, 2], [2, 4]], index=["cat", "dog"], columns=multicol1
+    )
+
+    with pytest.raises(
+        NotImplementedError,
+        match="Snowpark pandas doesn't support multiindex columns in stack API",
+    ):
+        df_multi_level_cols1.stack()
diff --git a/tests/unit/modin/test_unsupported.py b/tests/unit/modin/test_unsupported.py
@@ -104,7 +104,6 @@ def test_unsupported_general(general_method, kwargs):
         ["reorder_levels", {"order": ""}],
         ["sem", {}],
         ["set_flags", {}],
-        ["stack", {}],
         ["style", {}],
         ["swapaxes", {"axis1": "", "axis2": ""}],
         ["swaplevel", {}],