Skip to content

Commit

Permalink
SNOW-1445832: Added support for DataFrame.stack (#1821)
Browse files Browse the repository at this point in the history
Signed-off-by: Naren Krishna <[email protected]>
Co-authored-by: Devin Petersohn <[email protected]>
  • Loading branch information
sfc-gh-nkrishna and sfc-gh-dpetersohn authored Jun 26, 2024
1 parent 197afbf commit c96417b
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 15 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
- Added support for `Series.case_when` except when condition or replacement is callable.
- Added documentation pages for `Index` and its APIs.
- Added support for `DataFrame.assign`.
- Added support for `DataFrame.stack`.

#### Bug Fixes

Expand Down
9 changes: 5 additions & 4 deletions docs/source/modin/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -183,13 +183,14 @@ DataFrame
.. autosummary::
:toctree: pandas_api/

DataFrame.pivot_table
DataFrame.sort_values
DataFrame.sort_index
DataFrame.melt
DataFrame.nlargest
DataFrame.nsmallest
DataFrame.melt
DataFrame.pivot_table
DataFrame.sort_index
DataFrame.sort_values
DataFrame.squeeze
DataFrame.stack
DataFrame.T
DataFrame.transpose

Expand Down
3 changes: 2 additions & 1 deletion docs/source/modin/supported/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,8 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``squeeze`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``stack`` | N | | |
| ``stack`` | P | ``level``, | ``N`` for MultiIndex |
| | | ``future_stack`` is ignored | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``std`` | P | | ``N`` if ``ddof`` is not 0 or 1 |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
Expand Down
32 changes: 24 additions & 8 deletions src/snowflake/snowpark/modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2354,23 +2354,39 @@ def squeeze(self, axis: Axis | None = None):
return Series(query_compiler=self.T._query_compiler)
return self.copy()

@dataframe_not_implemented()
def stack(self, level=-1, dropna=True): # noqa: PR01, RT01, D200
def stack(
self,
level: int | str | list = -1,
dropna: bool | NoDefault = no_default,
sort: bool | NoDefault = no_default,
future_stack: bool = False, # ignored
):
"""
Stack the prescribed level(s) from columns to index.
"""
# TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions
if not isinstance(self.columns, pandas.MultiIndex) or (
isinstance(self.columns, pandas.MultiIndex)
and is_list_like(level)
and len(level) == self.columns.nlevels
if future_stack is not False:
WarningMessage.ignored_argument( # pragma: no cover
operation="DataFrame.stack",
argument="future_stack",
message="future_stack parameter has been ignored with Snowflake execution engine",
)
if dropna is NoDefault:
dropna = True # pragma: no cover
if sort is NoDefault:
sort = True # pragma: no cover

# This ensures that non-pandas MultiIndex objects are caught.
is_multiindex = len(self.columns.names) > 1
if not is_multiindex or (
is_multiindex and is_list_like(level) and len(level) == self.columns.nlevels
):
return self._reduce_dimension(
query_compiler=self._query_compiler.stack(level, dropna)
query_compiler=self._query_compiler.stack(level, dropna, sort)
)
else:
return self.__constructor__(
query_compiler=self._query_compiler.stack(level, dropna)
query_compiler=self._query_compiler.stack(level, dropna, sort)
)

def sub(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2503,7 +2503,7 @@ def sort_rows_by_column_values(
na_position: Puts NaNs at the beginning if 'first'; 'last' puts NaNs at the end. Defaults to 'last'
ignore_index: If True, existing index is ignored and new index is generated which is a gap free
sequence from 0 to n-1. Defaults to False.
key: Apply the key function to the values before sorting. Fallback to native pandas if key is provided.
key: Apply the key function to the values before sorting.

Returns:
A new SnowflakeQueryCompiler instance after applying the sort.
Expand Down Expand Up @@ -14815,3 +14815,71 @@ def pct_change(
}
).frame
)

def stack(
    self,
    level: Union[int, str, list] = -1,
    dropna: bool = True,
    sort: bool = True,
) -> "SnowflakeQueryCompiler":
    """
    Stack the prescribed level(s) from columns to index.

    The column labels become a new inner-most index level and the cell
    values are gathered into a single value column.

    Only ``level == -1`` on single-level columns is handled here; any other
    ``level`` and MultiIndex columns are reported through
    ``ErrorMessage.not_implemented``.

    Parameters
    ----------
    level : int, str, list, default -1
        Level(s) to stack from the column axis onto the index axis.

    dropna : bool, default True
        Whether to drop rows in the result that contain missing values.

    sort : bool, default True
        Accepted for API compatibility. It is not consulted by this
        implementation: rows are always ordered with
        ``sort_rows_by_column_values``.

    Returns
    -------
    SnowflakeQueryCompiler
        Query compiler holding the stacked data, indexed by the original
        index followed by the former column labels, with both index names
        cleared.
    """
    if level != -1:
        ErrorMessage.not_implemented(
            "Snowpark pandas doesn't yet support 'level != -1' in stack API",
        )
    if self._modin_frame.is_multiindex(axis=1):
        ErrorMessage.not_implemented(
            "Snowpark pandas doesn't support multiindex columns in stack API"
        )

    # NOTE(review): assumes reset_index() exposes the former index as a column
    # labelled "index" - confirm this also holds for named indexes.
    outer_labels = ["index"]
    inner_label = "index_second_level"

    # Stack == melt on the reset index, order rows by the original index,
    # then promote (original index, former column label) back to the index.
    melted = self.reset_index().melt(
        id_vars=outer_labels,
        value_vars=self.columns,
        var_name=inner_label,
        value_name=MODIN_UNNAMED_SERIES_LABEL,
        ignore_index=False,
    )
    # The row ordering is applied unconditionally, even when `sort` is False.
    ordered = melted.sort_rows_by_column_values(
        columns=outer_labels,  # type: ignore
        ascending=[True],
        kind="stable",
        na_position="last",
        ignore_index=False,
    )
    # Swap the unpivot null placeholder back to NaN.
    restored = ordered.replace(to_replace=UNPIVOT_NULL_REPLACE_VALUE, value=np.nan)
    reindexed = restored.set_index_from_columns(
        outer_labels + [inner_label]  # type: ignore
    )
    stacked = reindexed.set_index_names([None, None])

    if not dropna:
        return stacked
    return stacked.dropna(axis=0, how="any", thresh=None)
53 changes: 53 additions & 0 deletions src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3227,6 +3227,59 @@ def squeeze():
def stack():
    """
    Stack the prescribed level(s) from columns to index.

    Return a reshaped DataFrame or Series having a multi-level index with one
    or more new inner-most levels compared to the current DataFrame. The new inner-most
    levels are created by pivoting the columns of the current dataframe.

    If the columns have a single level, the output is a Series.
    If the columns have multiple levels, the new index level(s) is (are)
    taken from the prescribed level(s) and the output is a DataFrame.

    Parameters
    ----------
    level : int, str, list, default -1
        Level(s) to stack from the column axis onto the index axis,
        defined as one index or label, or a list of indices or labels.

    dropna : bool, default True
        Whether to drop rows in the resulting Frame/Series with missing values. Stacking a
        column level onto the index axis can create combinations of index and column values
        that are missing from the original dataframe.

    sort : bool, default True
        Whether to sort the levels of the resulting MultiIndex.

    future_stack : bool, default False
        This argument is ignored in Snowpark pandas.

    Returns
    -------
    DataFrame or Series
        Stacked dataframe or series.

    Notes
    -----
    ``level != -1`` and MultiIndex columns are not yet supported by Snowpark pandas.

    See Also
    --------
    DataFrame.unstack : Unstack prescribed level(s) from index axis onto column axis.
    DataFrame.pivot : Reshape dataframe from long format to wide format.
    DataFrame.pivot_table : Create a spreadsheet-style pivot table as a DataFrame.

    Examples
    --------
    >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], index=['cat', 'dog'], columns=['weight', 'height'])
    >>> df_single_level_cols
         weight  height
    cat       0       1
    dog       2       3
    >>> df_single_level_cols.stack()
    cat  weight    0
         height    1
    dog  weight    2
         height    3
    dtype: int64
    """

def sub():
Expand Down
54 changes: 54 additions & 0 deletions tests/integ/modin/frame/test_stack.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#

import modin.pandas as pd
import numpy as np
import pytest

from tests.integ.modin.sql_counter import sql_count_checker
from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result


@pytest.mark.parametrize(
    "data, index, columns",
    [
        ([[0, 1], [2, 3]], ["cat", "dog"], ["weight", "height"]),
        ([[0, np.nan], [np.nan, 3]], ["cat", "dog"], ["weight", "height"]),
    ],
)
@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize("sort", [True, False])
@sql_count_checker(query_count=1)
def test_stack(data, index, columns, dropna, sort):
    """
    Exercise ``DataFrame.stack`` on single-level columns for every
    ``dropna``/``sort`` combination, with and without NaN cells, and hand the
    frames from ``create_test_dfs`` to ``eval_snowpark_pandas_result``.
    """

    def stack_frame(df):
        # Same operation is applied to every frame passed to the evaluator.
        return df.stack(dropna=dropna, sort=sort)

    test_frames = create_test_dfs(data=data, index=index, columns=columns)
    eval_snowpark_pandas_result(*test_frames, stack_frame)


@sql_count_checker(query_count=0)
def test_stack_level_unsupported():
    """``stack(level=0)`` must raise ``NotImplementedError`` with the 'level != -1' message."""
    expected_message = "Snowpark pandas doesn't yet support 'level != -1' in stack API"
    animals = pd.DataFrame(
        [[0, 1], [2, 3]],
        index=["cat", "dog"],
        columns=["weight", "height"],
    )

    with pytest.raises(NotImplementedError, match=expected_message):
        animals.stack(level=0)


@sql_count_checker(query_count=0)
def test_stack_multiindex_unsupported():
    """Stacking a frame with MultiIndex columns must raise ``NotImplementedError``."""
    expected_message = "Snowpark pandas doesn't support multiindex columns in stack API"
    two_level_columns = pd.MultiIndex.from_tuples(
        [("weight", "kg"), ("weight", "pounds")]
    )
    weights = pd.DataFrame(
        [[1, 2], [2, 4]],
        index=["cat", "dog"],
        columns=two_level_columns,
    )

    with pytest.raises(NotImplementedError, match=expected_message):
        weights.stack()
1 change: 0 additions & 1 deletion tests/unit/modin/test_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ def test_unsupported_general(general_method, kwargs):
["reorder_levels", {"order": ""}],
["sem", {}],
["set_flags", {}],
["stack", {}],
["style", {}],
["swapaxes", {"axis1": "", "axis2": ""}],
["swaplevel", {}],
Expand Down

0 comments on commit c96417b

Please sign in to comment.