From fcfd6b48be1728da6e809b22a8bbeae8d30a154a Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 20 Jun 2024 10:07:32 -0700 Subject: [PATCH 01/21] wip --- .../modin/supported/groupby_supported.rst | 2 +- .../snowpark/modin/pandas/groupby.py | 39 ++++++++++++- .../compiler/snowflake_query_compiler.py | 57 +++++++++++++++++++ tests/unit/modin/test_groupby_unsupported.py | 2 - 4 files changed, 95 insertions(+), 5 deletions(-) diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst index f9ef001af29..2649109674d 100644 --- a/docs/source/modin/supported/groupby_supported.rst +++ b/docs/source/modin/supported/groupby_supported.rst @@ -166,7 +166,7 @@ Computations/descriptive stats +-----------------------------+---------------------------------+----------------------------------------------------+ | ``take`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``value_counts`` | N | | +| ``value_counts`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``var`` | P | See ``std`` | +-----------------------------+---------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/pandas/groupby.py b/src/snowflake/snowpark/modin/pandas/groupby.py index a373883317a..dae099ef30d 100644 --- a/src/snowflake/snowpark/modin/pandas/groupby.py +++ b/src/snowflake/snowpark/modin/pandas/groupby.py @@ -188,13 +188,22 @@ def sem(self, ddof=1): def value_counts( self, - subset=None, + subset: Optional[list[str]] = None, normalize: bool = False, sort: bool = True, ascending: bool = False, dropna: bool = True, ): - ErrorMessage.method_not_implemented_error(name="value_counts", class_="GroupBy") + return self._query_compiler.groupby_value_counts( + by=self._by, + axis=self._axis, + groupby_kwargs=self._kwargs, + subset=subset, + normalize=normalize, + sort=sort, + ascending=ascending, + dropna=dropna, + ) def mean( self, @@ -1314,6 +1323,32 @@ def get_group(self, name, obj=None): name="get_group", class_="SeriesGroupBy" ) + def value_counts( + self, + subset: Optional[list[str]] = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins: Optional[int] = None, + dropna: bool = True, + ): + # Unlike DataFrameGroupBy, SeriesGroupBy has an additional `bins` parameter + result = self._query_compiler.groupby_value_counts( + by=self._by, + axis=self._axis, + groupby_kwargs=self._kwargs, + subset=subset, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) + if self._kwargs["_as_index"]: + return pd.DataFrame(result) + else: + return pd.Series(result) + def validate_groupby_args( by: Any, diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 50ce5e71310..8a11f26f229 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5034,6 +5034,63 @@ def groupby_all( drop=drop, ) + def groupby_value_counts( + self, + by: Any, + axis: int, + groupby_kwargs: dict[str, Any], + subset: Optional[list[str]], + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins: Optional[int] = None, + dropna: bool = True, + ) -> "SnowflakeQueryCompiler": + 
level = groupby_kwargs.get("level", None) + as_index = groupby_kwargs.get("as_index", True) + is_supported = check_is_groupby_supported_by_snowflake(by, level, axis) + if not is_supported: + ErrorMessage.not_implemented( + "Snowpark pandas GroupBy.value_counts does not yet support pd.Grouper, axis == 1, by != None and level != None, by containing any non-pandas hashable labels, or unsupported aggregation parameters." + ) + if not is_list_like(by): + by = [by] + if subset is not None: + if not isinstance(subset, (list, tuple)): + subset = [subset] + else: + subset = [ + label + for label in self._modin_frame.data_column_pandas_labels + if label not in by + ] + + if as_index: + # When as_index=True, the result is a Series with a MultiIndex index. Each column in the + # original frame is treated as a level of this index. + raise NotImplementedError() + else: + # When as_index=False, the result is a DataFrame where count/proportion is appended as a new column. + # This is effectively the result of df.value_counts() sorted on the `by` columns. + return ( + self.value_counts( + subset=subset, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) + .reset_index() # Demote MultiIndex index back to normal columns + .sort_rows_by_column_values( # Sort on 'by' columns + columns=by, + ascending=[True] * len(by), + kind="quicksort", + na_position="last", + ignore_index=True, + ) + ) + def _get_dummies_helper( self, column: Hashable, diff --git a/tests/unit/modin/test_groupby_unsupported.py b/tests/unit/modin/test_groupby_unsupported.py index efc48724055..6bb27db446f 100644 --- a/tests/unit/modin/test_groupby_unsupported.py +++ b/tests/unit/modin/test_groupby_unsupported.py @@ -39,7 +39,6 @@ (lambda se: se.groupby("A").skew(), "skew"), (lambda se: se.groupby("A").take(2), "take"), (lambda se: se.groupby("A").expanding(), "expanding"), - (lambda se: se.groupby("A").value_counts(), "value_counts"), (lambda se: se.groupby("A").hist(), "hist"), (lambda se: se.groupby("A").plot(), "plot"), (lambda se: se.groupby("A").boxplot("test_group"), "boxplot"), @@ -83,7 +82,6 @@ def test_series_groupby_unsupported_methods_raises( (lambda df: df.groupby("A").skew(), "skew"), (lambda df: df.groupby("A").take(2), "take"), (lambda df: df.groupby("A").expanding(), "expanding"), - (lambda df: df.groupby("A").value_counts(), "value_counts"), (lambda df: df.groupby("A").hist(), "hist"), (lambda df: df.groupby("A").plot(), "plot"), (lambda df: df.groupby("A").boxplot("test_group"), "boxplot"), From 123dd8eed47f84ee3954d00e7f040666984dc9e3 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 20 Jun 2024 14:55:49 -0700 Subject: [PATCH 02/21] wip (need to fix normalize + subset) --- docs/source/modin/groupby.rst | 2 + .../snowpark/modin/pandas/groupby.py | 36 ++++---- .../compiler/snowflake_query_compiler.py | 59 ++++++++----- .../integ/modin/groupby/test_value_counts.py | 88 +++++++++++++++++++ 4 files changed, 147 insertions(+), 38 deletions(-) create mode 100644 tests/integ/modin/groupby/test_value_counts.py diff --git a/docs/source/modin/groupby.rst b/docs/source/modin/groupby.rst index 97c99ce383d..e27a3bcf547 100644 --- a/docs/source/modin/groupby.rst +++ b/docs/source/modin/groupby.rst @@ -59,6 +59,7 @@ GroupBy DataFrameGroupBy.std DataFrameGroupBy.sum DataFrameGroupBy.tail + DataFrameGroupBy.value_counts DataFrameGroupBy.var .. 
rubric:: `SeriesGroupBy` computations / descriptive stats @@ -90,4 +91,5 @@ GroupBy SeriesGroupBy.std SeriesGroupBy.sum SeriesGroupBy.tail + SeriesGroupBy.value_counts SeriesGroupBy.var diff --git a/src/snowflake/snowpark/modin/pandas/groupby.py b/src/snowflake/snowpark/modin/pandas/groupby.py index dae099ef30d..e982cfb2651 100644 --- a/src/snowflake/snowpark/modin/pandas/groupby.py +++ b/src/snowflake/snowpark/modin/pandas/groupby.py @@ -194,7 +194,7 @@ def value_counts( ascending: bool = False, dropna: bool = True, ): - return self._query_compiler.groupby_value_counts( + query_compiler = self._query_compiler.groupby_value_counts( by=self._by, axis=self._axis, groupby_kwargs=self._kwargs, @@ -204,6 +204,12 @@ def value_counts( ascending=ascending, dropna=dropna, ) + if self._as_index: + return pd.Series( + query_compiler=query_compiler, + name="proportion" if normalize else "count", + ) + return pd.DataFrame(query_compiler=query_compiler) def mean( self, @@ -1332,22 +1338,22 @@ def value_counts( bins: Optional[int] = None, dropna: bool = True, ): + # TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.SeriesGroupBy functions # Unlike DataFrameGroupBy, SeriesGroupBy has an additional `bins` parameter - result = self._query_compiler.groupby_value_counts( - by=self._by, - axis=self._axis, - groupby_kwargs=self._kwargs, - subset=subset, - normalize=normalize, - sort=sort, - ascending=ascending, - bins=bins, - dropna=dropna, + return pd.Series( + query_compiler=self._query_compiler.groupby_value_counts( + by=self._by, + axis=self._axis, + groupby_kwargs=self._kwargs, + subset=subset, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ), + name="proportion" if normalize else "count", ) - if self._kwargs["_as_index"]: - return pd.DataFrame(result) - else: - return pd.Series(result) def validate_groupby_args( diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 8a11f26f229..f28ab7772a0 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5066,30 +5066,43 @@ def groupby_value_counts( ] if as_index: - # When as_index=True, the result is a Series with a MultiIndex index. Each column in the - # original frame is treated as a level of this index. - raise NotImplementedError() - else: - # When as_index=False, the result is a DataFrame where count/proportion is appended as a new column. - # This is effectively the result of df.value_counts() sorted on the `by` columns. - return ( - self.value_counts( - subset=subset, - normalize=normalize, - sort=sort, - ascending=ascending, - bins=bins, - dropna=dropna, - ) - .reset_index() # Demote MultiIndex index back to normal columns - .sort_rows_by_column_values( # Sort on 'by' columns - columns=by, - ascending=[True] * len(by), - kind="quicksort", - na_position="last", - ignore_index=True, - ) + # When as_index=True, the result is a Series with a MultiIndex index. + result = self.value_counts( + subset=subset, + normalize=normalize, + sort=sort, + bins=bins, + dropna=dropna, ) + else: + # When as_index=False, the result is a DataFrame where count/proportion is appended as a new named column. 
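+            # For example (an illustration mirroring the docstring examples added later
+            # in this change, not behavior unique to this branch):
+            #   df.groupby("gender", as_index=False).value_counts() yields the columns
+            #   ["gender", <remaining subset columns>, "count"]; the final column is
+            #   named "proportion" instead when normalize=True.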
+ sort_cols = by + result = self.value_counts( + subset=subset, + normalize=normalize, + bins=bins, + dropna=dropna, + ).reset_index() + result = result.set_columns( + result._modin_frame.data_column_pandas_labels[:-1] + + ["proportion" if normalize else "count"] + ) + # Always sort the result by all columns except proportion/count + sort_cols = self._modin_frame.data_column_pandas_labels + ascending_cols = [True] * len(sort_cols) + if sort: + # When sort=True, also sort on the value column (always the last) + sort_cols.append( + result._modin_frame.data_column_pandas_labels[-1], + ) + ascending_cols.append(ascending) + return result.sort_rows_by_column_values( + columns=sort_cols, + ascending=ascending_cols, + kind="quicksort", + na_position="last", + ignore_index=not as_index, # When as_index=False, take the default positional index + ) def _get_dummies_helper( self, diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py new file mode 100644 index 00000000000..e9b7701bf46 --- /dev/null +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -0,0 +1,88 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import modin.pandas as pd +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result + +TEST_DATA = [ + { + "by": ["c", "b", "a", "a", "b", "b", "c", "a"], + "value1": ["ee", "aa", "bb", "aa", "bb", "cc", "dd", "aa"], + "value2": [1, 2, 3, 1, 1, 3, 2, 1], + }, + { + "by": ["key 1", None, None, "key 1", "key 2", "key 1"], + "value1": [None, "value", None, None, None, "value"], + "value2": ["value", None, None, None, "value", None], + }, + # Copied from pandas docs + { + "by": ["male", "male", "female", "male", "female", "male"], + "value1": ["low", "medium", "high", "low", "high", "low"], + "value2": ["US", "FR", "US", "FR", "FR", "FR"], + }, +] + + +@pytest.mark.parametrize("test_data", TEST_DATA) +@pytest.mark.parametrize("by", ["by", ["by", "value1"], ["by", "value2"]]) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "subset", + [None, ["value1"], ["value2"], ["value1", "value2"]], +) +@pytest.mark.parametrize("normalize", [False]) +@pytest.mark.parametrize("dropna", [True, False]) +@sql_count_checker(query_count=1) +def test_value_counts_basic(test_data, by, as_index, subset, normalize, dropna): + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda df: df.groupby(by=["by"], as_index=as_index).value_counts( + subset=subset, + normalize=normalize, + dropna=dropna, + ), + ) + + +@pytest.mark.parametrize("test_data", TEST_DATA) +@pytest.mark.parametrize("subset", [["by"], []]) +def test_value_counts_subset_negative(test_data, subset): + with SqlCounter(query_count=1 if len(subset) > 0 else 0): + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda x: x.groupby(by=["by"]).value_counts(subset=subset), + expect_exception=True, + ) + + +@pytest.mark.parametrize("test_data", TEST_DATA) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@sql_count_checker(query_count=1) +def test_value_counts_sort_ascending(test_data, as_index, sort, ascending): + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda x: x.groupby(by=["by"], 
as_index=as_index).value_counts( + sort=sort, ascending=ascending + ), + ) + + +@sql_count_checker(query_count=1) +def test_value_counts_series(): + by = ["a", "a", "b", "b", "a", "c"] + native_ser = native_pd.Series( + [0, 0, None, 1, None, 3], + ) + snow_ser = pd.Series(native_ser) + eval_snowpark_pandas_result( + snow_ser, native_ser, lambda ser: ser.groupby(by=by).value_counts() + ) From 5b1a46c109654ccc3778f68586c7ca86a8909b4c Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 16 Jul 2024 11:07:01 -0700 Subject: [PATCH 03/21] more wip (need to adjust by list) --- .../modin/plugin/compiler/snowflake_query_compiler.py | 6 +++++- tests/integ/modin/groupby/test_value_counts.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index f28ab7772a0..c87b1dfda1a 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5088,7 +5088,11 @@ def groupby_value_counts( + ["proportion" if normalize else "count"] ) # Always sort the result by all columns except proportion/count - sort_cols = self._modin_frame.data_column_pandas_labels + sort_cols = [ + label + for label in self._modin_frame.data_column_pandas_labels + if label in result._modin_frame.data_column_pandas_labels + ] ascending_cols = [True] * len(sort_cols) if sort: # When sort=True, also sort on the value column (always the last) diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py index e9b7701bf46..914f23f0d12 100644 --- a/tests/integ/modin/groupby/test_value_counts.py +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -43,7 +43,7 @@ def test_value_counts_basic(test_data, by, as_index, subset, normalize, dropna): eval_snowpark_pandas_result( *create_test_dfs(test_data), - lambda df: df.groupby(by=["by"], as_index=as_index).value_counts( + lambda df: df.groupby(by=by, as_index=as_index).value_counts( subset=subset, normalize=normalize, dropna=dropna, From f9e0cea4d6270b47f395f9cd6e93b8af43db380e Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 25 Jul 2024 11:17:41 -0700 Subject: [PATCH 04/21] fix stuff and tests --- .../compiler/snowflake_query_compiler.py | 59 +++-- .../integ/modin/groupby/test_value_counts.py | 214 +++++++++++++++++- 2 files changed, 243 insertions(+), 30 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index c87b1dfda1a..c67684baa69 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5048,54 +5048,59 @@ def groupby_value_counts( ) -> "SnowflakeQueryCompiler": level = groupby_kwargs.get("level", None) as_index = groupby_kwargs.get("as_index", True) + groupby_sort = groupby_kwargs.get("sort", True) is_supported = check_is_groupby_supported_by_snowflake(by, level, axis) if not is_supported: ErrorMessage.not_implemented( - "Snowpark pandas GroupBy.value_counts does not yet support pd.Grouper, axis == 1, by != None and level != None, by containing any non-pandas hashable labels, or unsupported aggregation parameters." 
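+                # Reuse the shared unsupported-grouping message (assumed to describe the
+                # same pd.Grouper/axis/level caveats as the hardcoded string it replaces).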
+ f"Snowpark pandas GroupBy.value_counts {_GROUPBY_UNSUPPORTED_GROUPING_MESSAGE}" ) if not is_list_like(by): by = [by] if subset is not None: if not isinstance(subset, (list, tuple)): subset = [subset] - else: - subset = [ - label - for label in self._modin_frame.data_column_pandas_labels - if label not in by - ] + # All "by" columns must be included in the subset list passed to value_counts + subset = [by_label for by_label in by if by_label not in subset] + subset + else: + # If subset is unspecified, then all columns are part of it + subset = self._modin_frame.data_column_pandas_labels if as_index: # When as_index=True, the result is a Series with a MultiIndex index. result = self.value_counts( + # Use sort=False to preserve the original order + sort=False, subset=subset, normalize=normalize, - sort=sort, bins=bins, dropna=dropna, + normalize_within_groups=by, ) else: # When as_index=False, the result is a DataFrame where count/proportion is appended as a new named column. - sort_cols = by result = self.value_counts( + # Use sort=False to preserve the original order + sort=False, subset=subset, normalize=normalize, bins=bins, dropna=dropna, + normalize_within_groups=by, ).reset_index() result = result.set_columns( result._modin_frame.data_column_pandas_labels[:-1] + ["proportion" if normalize else "count"] ) - # Always sort the result by all columns except proportion/count - sort_cols = [ - label - for label in self._modin_frame.data_column_pandas_labels - if label in result._modin_frame.data_column_pandas_labels - ] + # Within groups, rows are ordered based on their first appearance in the input frame. + # Note that in native pandas, preservation of order on non-grouping columns is not guaranteed: + # https://github.com/pandas-dev/pandas/issues/59307 + sort_cols = [] + if groupby_sort: + # When groupby(sort=True), sort the result on the grouping columns + sort_cols = by ascending_cols = [True] * len(sort_cols) if sort: - # When sort=True, also sort on the value column (always the last) + # When sort=True, also sort on the count/proportion column (always the last) sort_cols.append( result._modin_frame.data_column_pandas_labels[-1], ) @@ -5103,7 +5108,7 @@ def groupby_value_counts( return result.sort_rows_by_column_values( columns=sort_cols, ascending=ascending_cols, - kind="quicksort", + kind="stable", na_position="last", ignore_index=not as_index, # When as_index=False, take the default positional index ) @@ -11552,6 +11557,8 @@ def value_counts( ascending: bool = False, bins: Optional[int] = None, dropna: bool = True, + *, + normalize_within_groups: Optional[list[str]] = None, ) -> "SnowflakeQueryCompiler": """ Counts the frequency or number of unique values of SnowflakeQueryCompiler. @@ -11576,6 +11583,10 @@ def value_counts( This argument is not supported yet. dropna : bool, default True Don't include counts of NaN. + normalize_within_groups : list[str], optional + If set, the normalize parameter will normalize based on the specified groups + rather than the entire dataset. This parameter is exclusive to the Snowpark pandas + query compiler and is only used internally to implement groupby_value_counts. 
""" # TODO: SNOW-924742 Support bins in Series.value_counts if bins is not None: @@ -11647,9 +11658,21 @@ def _value_counts_groupby( # they are normalized to percentages as [2/(2+1+1), 1/(2+1+1), 1/(2+1+1)] = [0.5, 0.25, 0.25] # by default, ratio_to_report returns a decimal column, whereas pandas returns a float column if normalize: + if normalize_within_groups: + # If normalize_within_groups is set, then the denominator for ratio_to_report should + # be the size of each group instead. + normalize_snowflake_quoted_identifiers = [ + entry[0] + for entry in internal_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( + normalize_within_groups + ) + ] + window = Window.partition_by(normalize_snowflake_quoted_identifiers) + else: + window = None internal_frame = query_compiler._modin_frame.project_columns( [COUNT_LABEL], - builtin("ratio_to_report")(col(count_identifier)).over(), + builtin("ratio_to_report")(col(count_identifier)).over(window), ) count_identifier = internal_frame.data_column_snowflake_quoted_identifiers[ 0 diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py index 914f23f0d12..5667ec2bd2a 100644 --- a/tests/integ/modin/groupby/test_value_counts.py +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -8,7 +8,11 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker -from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result +from tests.integ.modin.utils import ( + assert_snowpark_pandas_equal_to_pandas, + create_test_dfs, + eval_snowpark_pandas_result, +) TEST_DATA = [ { @@ -31,23 +35,205 @@ @pytest.mark.parametrize("test_data", TEST_DATA) -@pytest.mark.parametrize("by", ["by", ["by", "value1"], ["by", "value2"]]) -@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("by", ["by"]) # , ["by", "value1"], ["by", "value2"]]) @pytest.mark.parametrize( "subset", [None, ["value1"], ["value2"], ["value1", "value2"]], ) -@pytest.mark.parametrize("normalize", [False]) +@pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("dropna", [True, False]) @sql_count_checker(query_count=1) -def test_value_counts_basic(test_data, by, as_index, subset, normalize, dropna): - eval_snowpark_pandas_result( - *create_test_dfs(test_data), - lambda df: df.groupby(by=by, as_index=as_index).value_counts( +def test_value_counts_basic(test_data, by, subset, normalize, dropna): + # Tests here use check_like=True because Snowpark pandas will always preserve the original + # order of rows within groups, while native pandas provides this guarantee only for the + # grouping and count/proportion columns. We check this ordering behavior in a separate test. + # + # Note that when as_index=False, check_like is insufficient because columns that were originally + # part of the MultiIndex are demoted to data columns before value_counts returns, so check_like + # doesn't know to reorder by those columns. 
+ # + # See these issues for details: + # https://github.com/pandas-dev/pandas/issues/59307 + # https://github.com/snowflakedb/snowpark-python/pull/1909 + # https://github.com/pandas-dev/pandas/issues/15833 + by_list = by if isinstance(by, list) else [by] + none_in_by_col = any(None in test_data[col] for col in by_list) + if not dropna and none_in_by_col: + # when dropna is False, pandas gives a different result because it drops all NaN + # keys in the multiindex + # https://github.com/pandas-dev/pandas/issues/56366 + # as a workaround, replace all Nones in the pandas frame with a sentinel value + # since NaNs are sorted last, we want the sentinel to sort to the end as well + VALUE_COUNTS_TEST_SENTINEL = "zzzzzz" + snow_df, native_df = create_test_dfs(test_data) + snow_result = snow_df.groupby(by=by).value_counts( + subset=subset, + normalize=normalize, + dropna=dropna, + ) + native_df = native_df.fillna(value=VALUE_COUNTS_TEST_SENTINEL) + native_result = native_df.groupby(by=by).value_counts( subset=subset, normalize=normalize, dropna=dropna, + ) + native_result.index = native_result.index.map( + lambda x: tuple(None if i == VALUE_COUNTS_TEST_SENTINEL else i for i in x) + ) + assert_snowpark_pandas_equal_to_pandas( + snow_result, native_result, check_like=True + ) + else: + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda df: df.groupby(by=by).value_counts( + subset=subset, + normalize=normalize, + dropna=dropna, + ), + check_like=True, + ) + + +@pytest.mark.parametrize( + "test_data, groupby_kwargs, sort, expected_result", + [ + ( + TEST_DATA[0], + { + "by": ["by"], + "as_index": True, + "sort": True, + }, + True, + # In pandas, the row [c, dd, 2] appears before [c, ee, 1] despite the latter + # appearing earlier in the original frame + # + # by value1 value2 + # a aa 1 2 + # bb 3 1 + # b aa 2 1 + # bb 1 1 + # cc 3 1 + # c ee 1 1 + # dd 2 1 + # Name: count, dtype: int64 + native_pd.Series( + [2, 1, 1, 1, 1, 1, 1], + name="count", + index=pd.MultiIndex.from_tuples( + [ + ("a", "aa", 1), + ("a", "bb", 3), + ("b", "aa", 2), + ("b", "bb", 1), + ("b", "cc", 3), + ("c", "ee", 1), + ("c", "dd", 2), + ], + names=["by", "value1", "value2"], + ), + ), + ), + ( + TEST_DATA[0], + { + "by": ["by"], + "as_index": True, + "sort": True, + }, + False, + # by value1 value2 + # a bb 3 1 + # aa 1 2 + # b aa 2 1 + # bb 1 1 + # cc 3 1 + # c ee 1 1 + # dd 2 1 + # Name: count, dtype: int64 + native_pd.Series( + [1, 2, 1, 1, 1, 1, 1], + name="count", + index=pd.MultiIndex.from_tuples( + [ + ("a", "bb", 3), + ("a", "aa", 1), + ("b", "aa", 2), + ("b", "bb", 1), + ("b", "cc", 3), + ("c", "ee", 1), + ("c", "dd", 2), + ], + names=["by", "value1", "value2"], + ), + ), ), + ( + TEST_DATA[0], + { + "by": ["by"], + "as_index": False, + "sort": True, + }, + True, + # by value1 value2 count + # 0 a aa 1 2 + # 1 a bb 3 1 + # 2 b aa 2 1 + # 3 b bb 1 1 + # 4 b cc 3 1 + # 5 c ee 1 1 + # 6 c dd 2 1 + native_pd.DataFrame( + { + "by": ["a", "a", "b", "b", "b", "c", "c"], + "value1": ["aa", "bb", "aa", "bb", "cc", "ee", "dd"], + "value2": [1, 3, 2, 1, 3, 1, 2], + "count": [2, 1, 1, 1, 1, 1, 1], + } + ), + ), + ( + TEST_DATA[0], + { + "by": ["by"], + "as_index": False, + "sort": True, + }, + False, + # by value1 value2 count + # 0 a bb 3 1 + # 1 a aa 1 2 + # 2 b aa 2 1 + # 3 b bb 1 1 + # 4 b cc 3 1 + # 5 c ee 1 1 + # 6 c dd 2 1 + native_pd.DataFrame( + { + "by": ["a", "a", "b", "b", "b", "c", "c"], + "value1": ["bb", "aa", "aa", "bb", "cc", "ee", "dd"], + "value2": [3, 1, 2, 1, 3, 1, 2], + "count": [1, 2, 1, 1, 
1, 1, 1], + } + ), + ), + ], +) +@sql_count_checker(query_count=1) +def test_value_counts_pandas_issue_59307( + test_data, + groupby_kwargs, + sort, + expected_result, +): + # Tests that Snowpark pandas preserves the order of rows from the input frame within groups. + # When groupby(sort=True), native pandas guarantees order is preserved only for the grouping columns. + # https://github.com/pandas-dev/pandas/issues/59307 + assert_snowpark_pandas_equal_to_pandas( + pd.DataFrame(test_data).groupby(**groupby_kwargs).value_counts(sort=sort), + expected_result, ) @@ -64,18 +250,22 @@ def test_value_counts_subset_negative(test_data, subset): @pytest.mark.parametrize("test_data", TEST_DATA) @pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("groupby_sort", [True, False]) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("ascending", [True, False]) @sql_count_checker(query_count=1) -def test_value_counts_sort_ascending(test_data, as_index, sort, ascending): +def test_value_counts_sort_ascending( + test_data, as_index, groupby_sort, sort, ascending +): eval_snowpark_pandas_result( *create_test_dfs(test_data), - lambda x: x.groupby(by=["by"], as_index=as_index).value_counts( - sort=sort, ascending=ascending - ), + lambda x: x.groupby( + by=["by"], as_index=as_index, sort=groupby_sort + ).value_counts(sort=sort, ascending=ascending), ) +# TODO parametrize on bins @sql_count_checker(query_count=1) def test_value_counts_series(): by = ["a", "a", "b", "b", "a", "c"] From bdf1f2f56adcddfc611022ab4ff3325eb12c3d08 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 25 Jul 2024 12:05:25 -0700 Subject: [PATCH 05/21] series groupby value_counts --- .../snowpark/modin/pandas/groupby.py | 40 +++++++++++++------ .../integ/modin/groupby/test_value_counts.py | 19 ++++++++- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/groupby.py b/src/snowflake/snowpark/modin/pandas/groupby.py index e982cfb2651..de89a48331b 100644 --- a/src/snowflake/snowpark/modin/pandas/groupby.py +++ b/src/snowflake/snowpark/modin/pandas/groupby.py @@ -49,6 +49,7 @@ create_groupby_transform_func, ) from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta +from snowflake.snowpark.modin.plugin._internal.utils import INDEX_LABEL from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import ( SnowflakeQueryCompiler, ) @@ -1339,19 +1340,34 @@ def value_counts( dropna: bool = True, ): # TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.SeriesGroupBy functions - # Unlike DataFrameGroupBy, SeriesGroupBy has an additional `bins` parameter + # Modin upstream defaults to pandas for this method, so we need to either override this or + # rewrite this logic to be friendlier to other backends. + # + # Unlike DataFrameGroupBy, SeriesGroupBy has an additional `bins` parameter. + qc = self._query_compiler + # The "by" list becomes the new index, which we then perform the group by on. We call + # reset_index to let the query compiler treat it as a data column so it can be grouped on. 
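+        # Sketch of the intended flow (INDEX_LABEL is an internal placeholder label for
+        # the materialized by-list column):
+        #   ser.groupby(["a", "a", "b"]).value_counts()
+        #     -> set_index_from_series(by) -> reset_index()
+        #     -> groupby_value_counts(by=[INDEX_LABEL], ...)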
+ if self._by is not None: + qc = ( + qc.set_index_from_series(pd.Series(self._by)._query_compiler) + .set_index_names([INDEX_LABEL]) + .reset_index() + ) + result_qc = qc.groupby_value_counts( + by=[INDEX_LABEL], + axis=self._axis, + groupby_kwargs=self._kwargs, + subset=subset, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) + # Reset the names in the MultiIndex + result_qc = result_qc.set_index_names([None] * result_qc.nlevels()) return pd.Series( - query_compiler=self._query_compiler.groupby_value_counts( - by=self._by, - axis=self._axis, - groupby_kwargs=self._kwargs, - subset=subset, - normalize=normalize, - sort=sort, - ascending=ascending, - bins=bins, - dropna=dropna, - ), + query_compiler=result_qc, name="proportion" if normalize else "count", ) diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py index 5667ec2bd2a..b7913533897 100644 --- a/tests/integ/modin/groupby/test_value_counts.py +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -265,8 +265,9 @@ def test_value_counts_sort_ascending( ) -# TODO parametrize on bins -@sql_count_checker(query_count=1) +# An additional query is needed to validate the length of the by list +# A JOIN is needed to set the index to the by list +@sql_count_checker(query_count=2, join_count=1) def test_value_counts_series(): by = ["a", "a", "b", "b", "a", "c"] native_ser = native_pd.Series( @@ -276,3 +277,17 @@ def test_value_counts_series(): eval_snowpark_pandas_result( snow_ser, native_ser, lambda ser: ser.groupby(by=by).value_counts() ) + + +# 1 query always runs to validate the length of the by list +@sql_count_checker(query_count=1) +def test_value_counts_bins_unimplemented(): + by = ["a", "a", "b", "b", "a", "c"] + native_ser = native_pd.Series( + [0, 0, None, 1, None, 3], + ) + snow_ser = pd.Series(native_ser) + with pytest.raises(NotImplementedError): + eval_snowpark_pandas_result( + snow_ser, native_ser, lambda ser: ser.groupby(by=by).value_counts(bins=3) + ) From ff9a555f2eb21a1b9957af5ea222477ca3583154 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 25 Jul 2024 13:17:02 -0700 Subject: [PATCH 06/21] add bad subset test --- .../integ/modin/groupby/test_value_counts.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py index b7913533897..5ceaf2f2e0d 100644 --- a/tests/integ/modin/groupby/test_value_counts.py +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -7,7 +7,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equal_to_pandas, create_test_dfs, @@ -238,14 +238,17 @@ def test_value_counts_pandas_issue_59307( @pytest.mark.parametrize("test_data", TEST_DATA) -@pytest.mark.parametrize("subset", [["by"], []]) -def test_value_counts_subset_negative(test_data, subset): - with SqlCounter(query_count=1 if len(subset) > 0 else 0): - eval_snowpark_pandas_result( - *create_test_dfs(test_data), - lambda x: x.groupby(by=["by"]).value_counts(subset=subset), - expect_exception=True, - ) +@pytest.mark.parametrize("subset", [["bad_key"], ["by", "bad_key"]]) +# 1 query always runs to validate the length of the by list +@sql_count_checker(query_count=1) +def test_value_counts_bad_subset(test_data, 
subset): + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda x: x.groupby(by=["by"]).value_counts(subset=subset), + expect_exception=True, + expect_exception_type=KeyError, + assert_exception_equal=False, + ) @pytest.mark.parametrize("test_data", TEST_DATA) From 3866071509c3db205492fe44fffb0cf3540f1468 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 25 Jul 2024 14:31:54 -0700 Subject: [PATCH 07/21] fix ascending tests --- .../modin/supported/groupby_supported.rst | 2 +- .../integ/modin/groupby/test_value_counts.py | 198 +++++++++++++++--- 2 files changed, 168 insertions(+), 32 deletions(-) diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst index 2649109674d..d66d90773df 100644 --- a/docs/source/modin/supported/groupby_supported.rst +++ b/docs/source/modin/supported/groupby_supported.rst @@ -166,7 +166,7 @@ Computations/descriptive stats +-----------------------------+---------------------------------+----------------------------------------------------+ | ``take`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``value_counts`` | Y | | +| ``value_counts`` | P | SeriesGroupBy does not implement ``bins`` | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``var`` | P | See ``std`` | +-----------------------------+---------------------------------+----------------------------------------------------+ diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py index 5ceaf2f2e0d..19de54ccd11 100644 --- a/tests/integ/modin/groupby/test_value_counts.py +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -36,6 +36,7 @@ @pytest.mark.parametrize("test_data", TEST_DATA) @pytest.mark.parametrize("by", ["by"]) # , ["by", "value1"], ["by", "value2"]]) +@pytest.mark.parametrize("groupby_sort", [True, False]) @pytest.mark.parametrize( "subset", [None, ["value1"], ["value2"], ["value1", "value2"]], @@ -43,10 +44,10 @@ @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("dropna", [True, False]) @sql_count_checker(query_count=1) -def test_value_counts_basic(test_data, by, subset, normalize, dropna): - # Tests here use check_like=True because Snowpark pandas will always preserve the original - # order of rows within groups, while native pandas provides this guarantee only for the - # grouping and count/proportion columns. We check this ordering behavior in a separate test. +def test_value_counts_basic(test_data, by, groupby_sort, subset, normalize, dropna): + # In this test, we use check_like=True because Snowpark pandas will always preserve the original + # order of rows within groups, while native pandas provides this guarantee only for the grouping + # and count/proportion columns. We check this ordering behavior in a separate test. 
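+    # (Concretely: for TEST_DATA[0], native pandas can emit ("c", "dd", 2) before
+    # ("c", "ee", 1) within group "c", even though the latter appears first in the
+    # input; Snowpark pandas keeps first-appearance order within each group.)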
# # Note that when as_index=False, check_like is insufficient because columns that were originally # part of the MultiIndex are demoted to data columns before value_counts returns, so check_like @@ -66,13 +67,13 @@ def test_value_counts_basic(test_data, by, subset, normalize, dropna): # since NaNs are sorted last, we want the sentinel to sort to the end as well VALUE_COUNTS_TEST_SENTINEL = "zzzzzz" snow_df, native_df = create_test_dfs(test_data) - snow_result = snow_df.groupby(by=by).value_counts( + snow_result = snow_df.groupby(by=by, sort=groupby_sort).value_counts( subset=subset, normalize=normalize, dropna=dropna, ) native_df = native_df.fillna(value=VALUE_COUNTS_TEST_SENTINEL) - native_result = native_df.groupby(by=by).value_counts( + native_result = native_df.groupby(by=by, sort=groupby_sort).value_counts( subset=subset, normalize=normalize, dropna=dropna, @@ -86,7 +87,7 @@ def test_value_counts_basic(test_data, by, subset, normalize, dropna): else: eval_snowpark_pandas_result( *create_test_dfs(test_data), - lambda df: df.groupby(by=by).value_counts( + lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( subset=subset, normalize=normalize, dropna=dropna, @@ -96,8 +97,9 @@ def test_value_counts_basic(test_data, by, subset, normalize, dropna): @pytest.mark.parametrize( - "test_data, groupby_kwargs, sort, expected_result", + "test_data, groupby_kwargs, value_counts_kwargs, expected_result", [ + # value_counts(sort) ( TEST_DATA[0], { @@ -105,7 +107,10 @@ def test_value_counts_basic(test_data, by, subset, normalize, dropna): "as_index": True, "sort": True, }, - True, + { + "sort": True, + "ascending": False, + }, # In pandas, the row [c, dd, 2] appears before [c, ee, 1] despite the latter # appearing earlier in the original frame # @@ -142,7 +147,10 @@ def test_value_counts_basic(test_data, by, subset, normalize, dropna): "as_index": True, "sort": True, }, - False, + { + "sort": False, + "ascending": False, + }, # by value1 value2 # a bb 3 1 # aa 1 2 @@ -169,6 +177,7 @@ def test_value_counts_basic(test_data, by, subset, normalize, dropna): ), ), ), + # groupby(as_index) ( TEST_DATA[0], { @@ -176,7 +185,10 @@ def test_value_counts_basic(test_data, by, subset, normalize, dropna): "as_index": False, "sort": True, }, - True, + { + "sort": True, + "ascending": False, + }, # by value1 value2 count # 0 a aa 1 2 # 1 a bb 3 1 @@ -201,7 +213,10 @@ def test_value_counts_basic(test_data, by, subset, normalize, dropna): "as_index": False, "sort": True, }, - False, + { + "sort": False, + "ascending": False, + }, # by value1 value2 count # 0 a bb 3 1 # 1 a aa 1 2 @@ -219,20 +234,158 @@ def test_value_counts_basic(test_data, by, subset, normalize, dropna): } ), ), + # value_counts(ascending) + ( + TEST_DATA[0], + { + "by": ["by"], + "as_index": True, + "sort": True, + }, + { + "sort": True, + "ascending": True, + }, + # by value1 value2 + # a bb 3 1 + # aa 1 2 + # b aa 2 1 + # bb 1 1 + # cc 3 1 + # c ee 1 1 + # dd 2 1 + # Name: count, dtype: int64 + native_pd.Series( + [1, 2, 1, 1, 1, 1, 1], + name="count", + index=pd.MultiIndex.from_tuples( + [ + ("a", "bb", 3), + ("a", "aa", 1), + ("b", "aa", 2), + ("b", "bb", 1), + ("b", "cc", 3), + ("c", "ee", 1), + ("c", "dd", 2), + ], + names=["by", "value1", "value2"], + ), + ), + ), + ( + TEST_DATA[2], + { + "by": ["by"], + "as_index": True, + "sort": False, + }, + { + "sort": False, + }, + # by value1 value2 + # male low US 1 + # medium FR 1 + # female high US 1 + # male low FR 2 + # female high FR 1 + # Name: count, dtype: int64 + native_pd.Series( + 
[1, 1, 1, 2, 1], + name="count", + index=pd.MultiIndex.from_tuples( + [ + ("male", "low", "US"), + ("male", "medium", "FR"), + ("female", "high", "US"), + ("male", "low", "FR"), + ("female", "high", "FR"), + ], + names=["by", "value1", "value2"], + ), + ), + ), + ( + TEST_DATA[2], + { + "by": ["by"], + "as_index": True, + "sort": False, + }, + { + "sort": True, + "ascending": True, + }, + # by value1 value2 + # male low US 1 + # medium FR 1 + # female high US 1 + # FR 1 + # male low FR 2 + # Name: count, dtype: int64 + native_pd.Series( + [1, 1, 1, 1, 2], + name="count", + index=pd.MultiIndex.from_tuples( + [ + ("male", "low", "US"), + ("male", "medium", "FR"), + ("female", "high", "US"), + ("female", "high", "FR"), + ("male", "low", "FR"), + ], + names=["by", "value1", "value2"], + ), + ), + ), + ( + TEST_DATA[2], + { + "by": ["by"], + "as_index": True, + "sort": True, + }, + { + "sort": True, + "ascending": True, + }, + # by value1 value2 + # female high US 1 + # FR 1 + # male low US 1 + # medium FR 1 + # low FR 2 + # Name: count, dtype: int64 + native_pd.Series( + [1, 1, 1, 1, 2], + name="count", + index=pd.MultiIndex.from_tuples( + [ + ("female", "high", "US"), + ("female", "high", "FR"), + ("male", "low", "US"), + ("male", "medium", "FR"), + ("male", "low", "FR"), + ], + names=["by", "value1", "value2"], + ), + ), + ), ], ) @sql_count_checker(query_count=1) def test_value_counts_pandas_issue_59307( test_data, groupby_kwargs, - sort, + value_counts_kwargs, expected_result, ): # Tests that Snowpark pandas preserves the order of rows from the input frame within groups. # When groupby(sort=True), native pandas guarantees order is preserved only for the grouping columns. # https://github.com/pandas-dev/pandas/issues/59307 assert_snowpark_pandas_equal_to_pandas( - pd.DataFrame(test_data).groupby(**groupby_kwargs).value_counts(sort=sort), + pd.DataFrame(test_data) + .groupby(**groupby_kwargs) + .value_counts(**value_counts_kwargs), expected_result, ) @@ -251,23 +404,6 @@ def test_value_counts_bad_subset(test_data, subset): ) -@pytest.mark.parametrize("test_data", TEST_DATA) -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("groupby_sort", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("ascending", [True, False]) -@sql_count_checker(query_count=1) -def test_value_counts_sort_ascending( - test_data, as_index, groupby_sort, sort, ascending -): - eval_snowpark_pandas_result( - *create_test_dfs(test_data), - lambda x: x.groupby( - by=["by"], as_index=as_index, sort=groupby_sort - ).value_counts(sort=sort, ascending=ascending), - ) - - # An additional query is needed to validate the length of the by list # A JOIN is needed to set the index to the by list @sql_count_checker(query_count=2, join_count=1) From 1091d056cc664d79b7b2abaf703e1b89679630f2 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 25 Jul 2024 14:53:40 -0700 Subject: [PATCH 08/21] fix subset checks --- .../compiler/snowflake_query_compiler.py | 12 ++- .../integ/modin/groupby/test_value_counts.py | 102 +++++++++++------- 2 files changed, 73 insertions(+), 41 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index c67684baa69..9fdf1338651 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5056,14 +5056,20 @@ def 
groupby_value_counts( ) if not is_list_like(by): by = [by] + if len(set(by) & set(subset or [])): + # Check for overlap between by and subset. Since column names may contain customer data, + # unlike pandas, we do not include the offending labels in the error message. + raise ValueError("Keys in subset cannot be in the groupby column keys") if subset is not None: if not isinstance(subset, (list, tuple)): subset = [subset] - # All "by" columns must be included in the subset list passed to value_counts - subset = [by_label for by_label in by if by_label not in subset] + subset else: - # If subset is unspecified, then all columns are part of it + # If subset is unspecified, then all columns should be included. subset = self._modin_frame.data_column_pandas_labels + # The grouping columns are always included in the subset. + # Furthermore, the columns of the output must have the grouping columns first, in the order + # that they were specified. + subset = by + list(filter(lambda label: label not in by, subset)) if as_index: # When as_index=True, the result is a Series with a MultiIndex index. diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py index 19de54ccd11..6b82307f637 100644 --- a/tests/integ/modin/groupby/test_value_counts.py +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -7,7 +7,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equal_to_pandas, create_test_dfs, @@ -35,7 +35,7 @@ @pytest.mark.parametrize("test_data", TEST_DATA) -@pytest.mark.parametrize("by", ["by"]) # , ["by", "value1"], ["by", "value2"]]) +@pytest.mark.parametrize("by", ["by", ["value1", "by"], ["by", "value2"]]) @pytest.mark.parametrize("groupby_sort", [True, False]) @pytest.mark.parametrize( "subset", @@ -43,7 +43,6 @@ ) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("dropna", [True, False]) -@sql_count_checker(query_count=1) def test_value_counts_basic(test_data, by, groupby_sort, subset, normalize, dropna): # In this test, we use check_like=True because Snowpark pandas will always preserve the original # order of rows within groups, while native pandas provides this guarantee only for the grouping @@ -58,42 +57,62 @@ def test_value_counts_basic(test_data, by, groupby_sort, subset, normalize, drop # https://github.com/snowflakedb/snowpark-python/pull/1909 # https://github.com/pandas-dev/pandas/issues/15833 by_list = by if isinstance(by, list) else [by] - none_in_by_col = any(None in test_data[col] for col in by_list) - if not dropna and none_in_by_col: - # when dropna is False, pandas gives a different result because it drops all NaN - # keys in the multiindex - # https://github.com/pandas-dev/pandas/issues/56366 - # as a workaround, replace all Nones in the pandas frame with a sentinel value - # since NaNs are sorted last, we want the sentinel to sort to the end as well - VALUE_COUNTS_TEST_SENTINEL = "zzzzzz" - snow_df, native_df = create_test_dfs(test_data) - snow_result = snow_df.groupby(by=by, sort=groupby_sort).value_counts( - subset=subset, - normalize=normalize, - dropna=dropna, - ) - native_df = native_df.fillna(value=VALUE_COUNTS_TEST_SENTINEL) - native_result = native_df.groupby(by=by, sort=groupby_sort).value_counts( - subset=subset, - normalize=normalize, - dropna=dropna, - ) - native_result.index 
= native_result.index.map( - lambda x: tuple(None if i == VALUE_COUNTS_TEST_SENTINEL else i for i in x) - ) - assert_snowpark_pandas_equal_to_pandas( - snow_result, native_result, check_like=True - ) - else: - eval_snowpark_pandas_result( - *create_test_dfs(test_data), - lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( + if len(set(by_list) & set(subset or [])): + # If subset and by overlap, check for ValueError + # Unlike pandas, we do not surface label names in the error message + with SqlCounter(query_count=0): + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( + subset=subset, + normalize=normalize, + dropna=dropna, + ), + expect_exception=True, + expect_exception_type=ValueError, + expect_exception_match="in subset cannot be in the groupby column keys", + assert_exception_equal=False, + ) + return + with SqlCounter(query_count=1): + none_in_by_col = any(None in test_data[col] for col in by_list) + if not dropna and none_in_by_col: + # when dropna is False, pandas gives a different result because it drops all NaN + # keys in the multiindex + # https://github.com/pandas-dev/pandas/issues/56366 + # as a workaround, replace all Nones in the pandas frame with a sentinel value + # since NaNs are sorted last, we want the sentinel to sort to the end as well + VALUE_COUNTS_TEST_SENTINEL = "zzzzzz" + snow_df, native_df = create_test_dfs(test_data) + snow_result = snow_df.groupby(by=by, sort=groupby_sort).value_counts( subset=subset, normalize=normalize, dropna=dropna, - ), - check_like=True, - ) + ) + native_df = native_df.fillna(value=VALUE_COUNTS_TEST_SENTINEL) + native_result = native_df.groupby(by=by, sort=groupby_sort).value_counts( + subset=subset, + normalize=normalize, + dropna=dropna, + ) + native_result.index = native_result.index.map( + lambda x: tuple( + None if i == VALUE_COUNTS_TEST_SENTINEL else i for i in x + ) + ) + assert_snowpark_pandas_equal_to_pandas( + snow_result, native_result, check_like=True + ) + else: + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( + subset=subset, + normalize=normalize, + dropna=dropna, + ), + check_like=True, + ) @pytest.mark.parametrize( @@ -391,10 +410,17 @@ def test_value_counts_pandas_issue_59307( @pytest.mark.parametrize("test_data", TEST_DATA) -@pytest.mark.parametrize("subset", [["bad_key"], ["by", "bad_key"]]) +@pytest.mark.parametrize( + "subset, exception_cls", + [ + (["bad_key"], KeyError), # key not in frame + (["by", "bad_key"], KeyError), # key not in frame + (["by"], ValueError), # subset cannot overlap with grouping columns + ], +) # 1 query always runs to validate the length of the by list @sql_count_checker(query_count=1) -def test_value_counts_bad_subset(test_data, subset): +def test_value_counts_bad_subset(test_data, subset, exception_cls): eval_snowpark_pandas_result( *create_test_dfs(test_data), lambda x: x.groupby(by=["by"]).value_counts(subset=subset), From 6c3c6acf93f6bf2b3a66e5312f6d1516cde3d764 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 25 Jul 2024 14:56:15 -0700 Subject: [PATCH 09/21] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0767d5d3a0a..d79ae29d238 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -76,6 +76,7 @@ - Added support for `Index.is_boolean`, `Index.is_integer`, `Index.is_floating`, `Index.is_numeric`, and `Index.is_object`. 
- Added support for `DatetimeIndex.round`, `DatetimeIndex.floor` and `DatetimeIndex.ceil`. - Added support for `Series.dt.days_in_month` and `Series.dt.daysinmonth`. +- Added support for `DataFrameGroupBy.value_counts` and `SeriesGroupBy.value_counts`. #### Improvements From 8e02d8bb06d2edb15285b42ca4eeaafc77a98268 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 25 Jul 2024 15:37:00 -0700 Subject: [PATCH 10/21] fix bad subset test --- .../integ/modin/groupby/test_value_counts.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py index 6b82307f637..707080c535e 100644 --- a/tests/integ/modin/groupby/test_value_counts.py +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -409,25 +409,24 @@ def test_value_counts_pandas_issue_59307( ) -@pytest.mark.parametrize("test_data", TEST_DATA) @pytest.mark.parametrize( "subset, exception_cls", [ (["bad_key"], KeyError), # key not in frame - (["by", "bad_key"], KeyError), # key not in frame (["by"], ValueError), # subset cannot overlap with grouping columns + (["by", "bad_key"], ValueError), # subset cannot overlap with grouping columns ], ) -# 1 query always runs to validate the length of the by list -@sql_count_checker(query_count=1) -def test_value_counts_bad_subset(test_data, subset, exception_cls): - eval_snowpark_pandas_result( - *create_test_dfs(test_data), - lambda x: x.groupby(by=["by"]).value_counts(subset=subset), - expect_exception=True, - expect_exception_type=KeyError, - assert_exception_equal=False, - ) +def test_value_counts_bad_subset(subset, exception_cls): + # for KeyError, 1 query always runs to validate the length of the by list + with SqlCounter(query_count=1 if exception_cls is KeyError else 0): + eval_snowpark_pandas_result( + *create_test_dfs(TEST_DATA[0]), + lambda x: x.groupby(by=["by"]).value_counts(subset=subset), + expect_exception=True, + expect_exception_type=exception_cls, + assert_exception_equal=False, + ) # An additional query is needed to validate the length of the by list From a41d97b0efcd37603ec1071602eb12237395e4bf Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 25 Jul 2024 15:57:24 -0700 Subject: [PATCH 11/21] add docstrings --- .../modin/plugin/docstrings/groupby.py | 138 +++++++++++++++++- 1 file changed, 136 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py index 05d29f64850..10023fe1c28 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py @@ -203,7 +203,111 @@ def sem(): pass def value_counts(): - pass + """ + Return a Series or DataFrame containing counts of unique rows. + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + + normalize : bool, default False + Return proportions rather than frequencies. + + sort : bool, default True + Sort by frequencies. + + ascending : bool, default False + Sort in ascending order. + + dropna : bool, default True + Don't include counts of rows that contain NA values. + + Returns + ------- + :class:`~snowflake.snowpark.modin.pandas.Series` or :class:`~snowflake.snowpark.modin.pandas.DataFrame` + Series if the groupby as_index is True, otherwise DataFrame. 
+ + Notes + ----- + - If the groupby as_index is True then the returned Series will have a MultiIndex with one level per input column. + - If the groupby as_index is False then the returned DataFrame will have an additional column with the value_counts. + The column is labelled 'count' or 'proportion', depending on the normalize parameter. + + By default, rows that contain any NA values are omitted from the result. + + By default, the result will be in descending order so that the first element of each group is the most frequently-occurring row. + + **GroupBy.value_counts in Snowpark pandas may produce different results from vanilla pandas.** + This is because pandas internally uses a hash map to track counts, which results in row + orderings that are deterministic but platform-dependent. + + Snowpark pandas will always preserve the original order of rows in the input frame. When + `groupby` is called with `sort=True`, then the result is sorted on grouping columns; ties + are broken according to their original positions in the input frame. + When `value_counts` is called with `sort=True`, the result is sorted on the count/proportion + column; ties are again broken by their original positions. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... }) + + >>> df # doctest: +NORMALIZE_WHITESPACE + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + >>> df.groupby('gender').value_counts() # doctest: +NORMALIZE_WHITESPACE + gender education country + female high US 1 + FR 1 + male low FR 2 + US 1 + medium FR 1 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(ascending=True) # doctest: +NORMALIZE_WHITESPACE + gender education country + female high US 1 + FR 1 + male low US 1 + medium FR 1 + low FR 2 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE + gender education country + female high US 0.50 + FR 0.50 + male low FR 0.50 + US 0.25 + medium FR 0.25 + Name: proportion, dtype: float64 + + >>> df.groupby('gender', as_index=False).value_counts() # doctest: +NORMALIZE_WHITESPACE + gender education country count + 0 female high US 1 + 1 female high FR 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE + gender education country proportion + 0 female high US 0.50 + 1 female high FR 0.50 + 2 male low FR 0.50 + 3 male low US 0.25 + 4 male medium FR 0.25 + """ def mean(): """ @@ -2103,8 +2207,38 @@ def size(): """ pass - def unique(self): + def unique(): pass def apply(): pass + + def value_counts(): + """ + Return a Series or DataFrame containing counts of unique rows. + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + + normalize : bool, default False + Return proportions rather than frequencies. + + sort : bool, default True + Sort by frequencies. + + ascending : bool, default False + Sort in ascending order. + + bins : int, optional + Rather than count values, group them into half-open bins, a convenience for `pd.cut`, only works with numeric data. + This parameter is not yet supported in Snowpark pandas. 
+ + dropna : bool, default True + Don't include counts of rows that contain NA values. + + Returns + ------- + :class:`~snowflake.snowpark.modin.pandas.Series` + """ From 3c40453ebfde1c2efdab6504dc3e414c873cf188 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Fri, 26 Jul 2024 10:27:48 -0700 Subject: [PATCH 12/21] reconcile value_counts in query compiler --- .../compiler/snowflake_query_compiler.py | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 9fdf1338651..d1c86ec3236 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -12,7 +12,7 @@ import uuid from collections.abc import Hashable, Iterable, Mapping, Sequence from datetime import timedelta, tzinfo -from typing import Any, Callable, List, Literal, Optional, Tuple, Union, get_args +from typing import Any, Callable, List, Literal, Optional, Union, get_args import numpy as np import numpy.typing as npt @@ -5054,6 +5054,8 @@ def groupby_value_counts( ErrorMessage.not_implemented( f"Snowpark pandas GroupBy.value_counts {_GROUPBY_UNSUPPORTED_GROUPING_MESSAGE}" ) + if bins is not None: + raise ErrorMessage.not_implemented("bins argument is not yet supported") if not is_list_like(by): by = [by] if len(set(by) & set(subset or [])): @@ -5062,34 +5064,36 @@ def groupby_value_counts( raise ValueError("Keys in subset cannot be in the groupby column keys") if subset is not None: if not isinstance(subset, (list, tuple)): - subset = [subset] + subset_list = [subset] + else: + subset_list = subset else: # If subset is unspecified, then all columns should be included. - subset = self._modin_frame.data_column_pandas_labels + subset_list = self._modin_frame.data_column_pandas_labels # The grouping columns are always included in the subset. # Furthermore, the columns of the output must have the grouping columns first, in the order # that they were specified. - subset = by + list(filter(lambda label: label not in by, subset)) + subset_list = by + list(filter(lambda label: label not in by, subset_list)) if as_index: # When as_index=True, the result is a Series with a MultiIndex index. - result = self.value_counts( + result = self._value_counts_groupby( + by=subset_list, # Use sort=False to preserve the original order sort=False, - subset=subset, normalize=normalize, - bins=bins, + ascending=False, dropna=dropna, normalize_within_groups=by, ) else: # When as_index=False, the result is a DataFrame where count/proportion is appended as a new named column. - result = self.value_counts( + result = self._value_counts_groupby( + by=subset_list, # Use sort=False to preserve the original order sort=False, - subset=subset, normalize=normalize, - bins=bins, + ascending=False, dropna=dropna, normalize_within_groups=by, ).reset_index() @@ -11563,8 +11567,6 @@ def value_counts( ascending: bool = False, bins: Optional[int] = None, dropna: bool = True, - *, - normalize_within_groups: Optional[list[str]] = None, ) -> "SnowflakeQueryCompiler": """ Counts the frequency or number of unique values of SnowflakeQueryCompiler. @@ -11589,10 +11591,6 @@ def value_counts( This argument is not supported yet. dropna : bool, default True Don't include counts of NaN. 
-        normalize_within_groups : list[str], optional
-            If set, the normalize parameter will normalize based on the specified groups
-            rather than the entire dataset. This parameter is exclusive to the Snowpark pandas
-            query compiler and is only used internally to implement groupby_value_counts.
         """
         # TODO: SNOW-924742 Support bins in Series.value_counts
         if bins is not None:
@@ -11609,11 +11607,13 @@ def value_counts(
 
     def _value_counts_groupby(
         self,
-        by: Union[List[Hashable], Tuple[Hashable, ...]],
+        by: Sequence[Hashable],
         normalize: bool,
         sort: bool,
         ascending: bool,
         dropna: bool,
+        *,
+        normalize_within_groups: Optional[list[str]] = None,
     ) -> "SnowflakeQueryCompiler":
         """
         Helper method to obtain the frequency or number of unique values
@@ -11635,6 +11635,10 @@ def _value_counts_groupby(
             Sort in ascending order.
         dropna : bool
             Don't include counts of NaN.
+        normalize_within_groups : list[str], optional
+            If set, the normalize parameter will normalize based on the specified groups
+            rather than the entire dataset. This parameter is exclusive to the Snowpark pandas
+            query compiler and is only used internally to implement groupby_value_counts.
         """
         self._raise_not_implemented_error_for_timedelta()

From 7962fdefa3b981c349ae1e85add35ecd3bd45182 Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Tue, 27 Aug 2024 17:26:11 -0700
Subject: [PATCH 13/21] update sorting behavior

---
 .../compiler/snowflake_query_compiler.py      |  66 +++-
 .../integ/modin/groupby/test_value_counts.py  | 361 ++----------------
 2 files changed, 105 insertions(+), 322 deletions(-)

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
index d1c86ec3236..9da08ab9a7f 100644
--- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
+++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -5101,9 +5101,62 @@ def groupby_value_counts(
             result._modin_frame.data_column_pandas_labels[:-1]
             + ["proportion" if normalize else "count"]
         )
-        # Within groups, rows are ordered based on their first appearance in the input frame.
-        # Note that in native pandas, preservation of order on non-grouping columns is not guaranteed:
+        # pandas currently provides the following behaviors based on the different sort flags.
+        # These behaviors are not entirely consistent with documentation; see this issue for discussion:
         # https://github.com/pandas-dev/pandas/issues/59307
+        #
+        # Example data (using pandas 2.2.1 behavior):
+        # >>> df = pd.DataFrame({"X": ["B", "A", "A", "B", "B", "B"], "Y": [4, 1, 3, -2, -1, -1]})
+        #
+        # 1. groupby(sort=True).value_counts(sort=True)
+        #    Sort on non-grouping columns, then sort on frequencies, then sort on grouping columns.
+        #    >>> df.groupby("X", sort=True).value_counts(sort=True)
+        #    X  Y
+        #    A   1    1
+        #        3    1
+        #    B  -1    2
+        #       -2    1
+        #        4    1
+        #    Name: count, dtype: int64
+        #
+        # 2. groupby(sort=True).value_counts(sort=False)
+        #    Sort on non-grouping columns, then sort on grouping columns.
+        #    >>> df.groupby("X", sort=True).value_counts(sort=False)
+        #    X  Y
+        #    A   1    1
+        #        3    1
+        #    B  -2    1
+        #       -1    2
+        #        4    1
+        #    Name: count, dtype: int64
+        #
+        # 3. groupby(sort=False).value_counts(sort=True)
+        #    Sort on frequencies.
+        #    >>> df.groupby("X", sort=False).value_counts(sort=True)
+        #    X  Y
+        #    B  -1    2
+        #        4    1
+        #    A   1    1
+        #        3    1
+        #    B  -2    1
+        #    Name: count, dtype: int64
+        #
+        # 4. groupby(sort=False).value_counts(sort=False)
+        #    Sort on nothing (entries match the order of the original frame).
+        #    >>> df.groupby("X", sort=False).value_counts(sort=False)
+ # X Y + # B 4 1 + # A 1 1 + # 3 1 + # B -2 1 + # -1 2 + # Name: count, dtype: int64 + # + # Lastly, when `normalize` is set with groupby(sort=False).value_counts(sort=True, normalize=True), + # pandas will sort by the pre-normalization counts rather than the resulting proportions. As this + # is an uncommon edge case, we cannot handle this using existing QC methods efficiently, so we just + # update our testing code to account for this. + # See comment on issue: https://github.com/pandas-dev/pandas/issues/59307#issuecomment-2313767856 sort_cols = [] if groupby_sort: # When groupby(sort=True), sort the result on the grouping columns @@ -5115,6 +5168,15 @@ def groupby_value_counts( result._modin_frame.data_column_pandas_labels[-1], ) ascending_cols.append(ascending) + if groupby_sort: + # When groupby_sort=True, also sort by the non-grouping columns before sorting by + # the count/proportion column + # Exclude the grouping columns (always the first) from the sort + non_grouping_cols = result._modin_frame.index_column_pandas_labels[ + len(by) : + ] + sort_cols.extend(non_grouping_cols) + ascending_cols.extend([True] * len(non_grouping_cols)) return result.sort_rows_by_column_values( columns=sort_cols, ascending=ascending_cols, diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py index 707080c535e..83fe0eb7041 100644 --- a/tests/integ/modin/groupby/test_value_counts.py +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -37,26 +37,23 @@ @pytest.mark.parametrize("test_data", TEST_DATA) @pytest.mark.parametrize("by", ["by", ["value1", "by"], ["by", "value2"]]) @pytest.mark.parametrize("groupby_sort", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize( "subset", [None, ["value1"], ["value2"], ["value1", "value2"]], ) -@pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("dropna", [True, False]) -def test_value_counts_basic(test_data, by, groupby_sort, subset, normalize, dropna): - # In this test, we use check_like=True because Snowpark pandas will always preserve the original - # order of rows within groups, while native pandas provides this guarantee only for the grouping - # and count/proportion columns. We check this ordering behavior in a separate test. - # - # Note that when as_index=False, check_like is insufficient because columns that were originally - # part of the MultiIndex are demoted to data columns before value_counts returns, so check_like - # doesn't know to reorder by those columns. 
- # - # See these issues for details: - # https://github.com/pandas-dev/pandas/issues/59307 - # https://github.com/snowflakedb/snowpark-python/pull/1909 - # https://github.com/pandas-dev/pandas/issues/15833 +def test_value_counts_basic( + test_data, by, groupby_sort, sort, ascending, subset, dropna +): by_list = by if isinstance(by, list) else [by] + value_counts_kwargs = { + "sort": sort, + "ascending": ascending, + "subset": subset, + "dropna": dropna, + } if len(set(by_list) & set(subset or [])): # If subset and by overlap, check for ValueError # Unlike pandas, we do not surface label names in the error message @@ -64,9 +61,7 @@ def test_value_counts_basic(test_data, by, groupby_sort, subset, normalize, drop eval_snowpark_pandas_result( *create_test_dfs(test_data), lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( - subset=subset, - normalize=normalize, - dropna=dropna, + **value_counts_kwargs ), expect_exception=True, expect_exception_type=ValueError, @@ -85,327 +80,53 @@ def test_value_counts_basic(test_data, by, groupby_sort, subset, normalize, drop VALUE_COUNTS_TEST_SENTINEL = "zzzzzz" snow_df, native_df = create_test_dfs(test_data) snow_result = snow_df.groupby(by=by, sort=groupby_sort).value_counts( - subset=subset, - normalize=normalize, - dropna=dropna, + **value_counts_kwargs ) native_df = native_df.fillna(value=VALUE_COUNTS_TEST_SENTINEL) native_result = native_df.groupby(by=by, sort=groupby_sort).value_counts( - subset=subset, - normalize=normalize, - dropna=dropna, + **value_counts_kwargs ) native_result.index = native_result.index.map( lambda x: tuple( None if i == VALUE_COUNTS_TEST_SENTINEL else i for i in x ) ) - assert_snowpark_pandas_equal_to_pandas( - snow_result, native_result, check_like=True - ) + assert_snowpark_pandas_equal_to_pandas(snow_result, native_result) else: eval_snowpark_pandas_result( *create_test_dfs(test_data), lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( - subset=subset, - normalize=normalize, - dropna=dropna, + **value_counts_kwargs ), - check_like=True, ) -@pytest.mark.parametrize( - "test_data, groupby_kwargs, value_counts_kwargs, expected_result", - [ - # value_counts(sort) - ( - TEST_DATA[0], - { - "by": ["by"], - "as_index": True, - "sort": True, - }, - { - "sort": True, - "ascending": False, - }, - # In pandas, the row [c, dd, 2] appears before [c, ee, 1] despite the latter - # appearing earlier in the original frame - # - # by value1 value2 - # a aa 1 2 - # bb 3 1 - # b aa 2 1 - # bb 1 1 - # cc 3 1 - # c ee 1 1 - # dd 2 1 - # Name: count, dtype: int64 - native_pd.Series( - [2, 1, 1, 1, 1, 1, 1], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("a", "aa", 1), - ("a", "bb", 3), - ("b", "aa", 2), - ("b", "bb", 1), - ("b", "cc", 3), - ("c", "ee", 1), - ("c", "dd", 2), - ], - names=["by", "value1", "value2"], - ), - ), - ), - ( - TEST_DATA[0], - { - "by": ["by"], - "as_index": True, - "sort": True, - }, - { - "sort": False, - "ascending": False, - }, - # by value1 value2 - # a bb 3 1 - # aa 1 2 - # b aa 2 1 - # bb 1 1 - # cc 3 1 - # c ee 1 1 - # dd 2 1 - # Name: count, dtype: int64 - native_pd.Series( - [1, 2, 1, 1, 1, 1, 1], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("a", "bb", 3), - ("a", "aa", 1), - ("b", "aa", 2), - ("b", "bb", 1), - ("b", "cc", 3), - ("c", "ee", 1), - ("c", "dd", 2), - ], - names=["by", "value1", "value2"], - ), - ), - ), - # groupby(as_index) - ( - TEST_DATA[0], - { - "by": ["by"], - "as_index": False, - "sort": True, - }, - { - "sort": True, - "ascending": False, - }, - # 
by value1 value2 count - # 0 a aa 1 2 - # 1 a bb 3 1 - # 2 b aa 2 1 - # 3 b bb 1 1 - # 4 b cc 3 1 - # 5 c ee 1 1 - # 6 c dd 2 1 - native_pd.DataFrame( - { - "by": ["a", "a", "b", "b", "b", "c", "c"], - "value1": ["aa", "bb", "aa", "bb", "cc", "ee", "dd"], - "value2": [1, 3, 2, 1, 3, 1, 2], - "count": [2, 1, 1, 1, 1, 1, 1], - } - ), - ), - ( - TEST_DATA[0], - { - "by": ["by"], - "as_index": False, - "sort": True, - }, - { - "sort": False, - "ascending": False, - }, - # by value1 value2 count - # 0 a bb 3 1 - # 1 a aa 1 2 - # 2 b aa 2 1 - # 3 b bb 1 1 - # 4 b cc 3 1 - # 5 c ee 1 1 - # 6 c dd 2 1 - native_pd.DataFrame( - { - "by": ["a", "a", "b", "b", "b", "c", "c"], - "value1": ["bb", "aa", "aa", "bb", "cc", "ee", "dd"], - "value2": [3, 1, 2, 1, 3, 1, 2], - "count": [1, 2, 1, 1, 1, 1, 1], - } - ), - ), - # value_counts(ascending) - ( - TEST_DATA[0], - { - "by": ["by"], - "as_index": True, - "sort": True, - }, - { - "sort": True, - "ascending": True, - }, - # by value1 value2 - # a bb 3 1 - # aa 1 2 - # b aa 2 1 - # bb 1 1 - # cc 3 1 - # c ee 1 1 - # dd 2 1 - # Name: count, dtype: int64 - native_pd.Series( - [1, 2, 1, 1, 1, 1, 1], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("a", "bb", 3), - ("a", "aa", 1), - ("b", "aa", 2), - ("b", "bb", 1), - ("b", "cc", 3), - ("c", "ee", 1), - ("c", "dd", 2), - ], - names=["by", "value1", "value2"], - ), - ), - ), - ( - TEST_DATA[2], - { - "by": ["by"], - "as_index": True, - "sort": False, - }, - { - "sort": False, - }, - # by value1 value2 - # male low US 1 - # medium FR 1 - # female high US 1 - # male low FR 2 - # female high FR 1 - # Name: count, dtype: int64 - native_pd.Series( - [1, 1, 1, 2, 1], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("male", "low", "US"), - ("male", "medium", "FR"), - ("female", "high", "US"), - ("male", "low", "FR"), - ("female", "high", "FR"), - ], - names=["by", "value1", "value2"], - ), - ), - ), - ( - TEST_DATA[2], - { - "by": ["by"], - "as_index": True, - "sort": False, - }, - { - "sort": True, - "ascending": True, - }, - # by value1 value2 - # male low US 1 - # medium FR 1 - # female high US 1 - # FR 1 - # male low FR 2 - # Name: count, dtype: int64 - native_pd.Series( - [1, 1, 1, 1, 2], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("male", "low", "US"), - ("male", "medium", "FR"), - ("female", "high", "US"), - ("female", "high", "FR"), - ("male", "low", "FR"), - ], - names=["by", "value1", "value2"], - ), - ), - ), - ( - TEST_DATA[2], - { - "by": ["by"], - "as_index": True, - "sort": True, - }, - { - "sort": True, - "ascending": True, - }, - # by value1 value2 - # female high US 1 - # FR 1 - # male low US 1 - # medium FR 1 - # low FR 2 - # Name: count, dtype: int64 - native_pd.Series( - [1, 1, 1, 1, 2], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("female", "high", "US"), - ("female", "high", "FR"), - ("male", "low", "US"), - ("male", "medium", "FR"), - ("male", "low", "FR"), - ], - names=["by", "value1", "value2"], - ), - ), - ), - ], -) -@sql_count_checker(query_count=1) -def test_value_counts_pandas_issue_59307( - test_data, - groupby_kwargs, - value_counts_kwargs, - expected_result, +@pytest.mark.parametrize("test_data", TEST_DATA) +@pytest.mark.parametrize("by", ["by", ["value1", "by"], ["by", "value2"]]) +@pytest.mark.parametrize("groupby_sort", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_value_counts_normalize( + test_data, by, 
groupby_sort, sort, ascending, normalize ): - # Tests that Snowpark pandas preserves the order of rows from the input frame within groups. - # When groupby(sort=True), native pandas guarantees order is preserved only for the grouping columns. - # https://github.com/pandas-dev/pandas/issues/59307 - assert_snowpark_pandas_equal_to_pandas( - pd.DataFrame(test_data) - .groupby(**groupby_kwargs) - .value_counts(**value_counts_kwargs), - expected_result, + value_counts_kwargs = { + "sort": sort, + "ascending": ascending, + "normalize": normalize, + } + # When normalize is set, pandas will (counter-intuitively) sort by the pre-normalization + # counts rather than the result proportions. This only matters if groupby_sort is False + # and sort is True. + # We work around this by using check_like=True + # See https://github.com/pandas-dev/pandas/issues/59307#issuecomment-2313767856 + check_like = not groupby_sort and sort and normalize + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( + **value_counts_kwargs + ), + check_like=check_like, ) From 091fa63d3b36fb005e1cf801004bb52cd2a4344d Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 27 Aug 2024 17:29:14 -0700 Subject: [PATCH 14/21] remove disclaimer from docstring --- .../snowpark/modin/plugin/docstrings/groupby.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py index 10023fe1c28..70d8d568493 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py @@ -238,16 +238,6 @@ def value_counts(): By default, the result will be in descending order so that the first element of each group is the most frequently-occurring row. - **GroupBy.value_counts in Snowpark pandas may produce different results from vanilla pandas.** - This is because pandas internally uses a hash map to track counts, which results in row - orderings that are deterministic but platform-dependent. - - Snowpark pandas will always preserve the original order of rows in the input frame. When - `groupby` is called with `sort=True`, then the result is sorted on grouping columns; ties - are broken according to their original positions in the input frame. - When `value_counts` is called with `sort=True`, the result is sorted on the count/proportion - column; ties are again broken by their original positions. 
-
         Examples
         --------
         >>> df = pd.DataFrame({

From f829f28ec8c23a7e40bc05a7c12e34d9beab640f Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Wed, 28 Aug 2024 17:49:52 -0700
Subject: [PATCH 15/21] fix sorting bug

---
 .../modin/plugin/compiler/snowflake_query_compiler.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
index 9da08ab9a7f..70cfba44bed 100644
--- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
+++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -5170,10 +5170,13 @@ def groupby_value_counts(
             ascending_cols.append(ascending)
             if groupby_sort:
                 # When groupby_sort=True, also sort by the non-grouping columns before sorting by
-                # the count/proportion column
-                # Exclude the grouping columns (always the first) from the sort
-                non_grouping_cols = result._modin_frame.index_column_pandas_labels[
-                    len(by) :
+                # the count/proportion column. The left-most column (nearest to the grouping columns)
+                # is sorted last.
+                # Exclude the grouping columns (always the first) from the sort.
+                non_grouping_cols = [
+                    col_label
+                    for col_label in result._modin_frame.index_column_pandas_labels
+                    if col_label not in by
                 ]
                 sort_cols.extend(non_grouping_cols)
                 ascending_cols.extend([True] * len(non_grouping_cols))

From 457bbe9969555786417c35abc3f45cab03a26fc7 Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Wed, 28 Aug 2024 20:43:53 -0700
Subject: [PATCH 16/21] add query counter

---
 tests/integ/modin/groupby/test_value_counts.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py
index 83fe0eb7041..ae4a14ba88b 100644
--- a/tests/integ/modin/groupby/test_value_counts.py
+++ b/tests/integ/modin/groupby/test_value_counts.py
@@ -107,6 +107,7 @@
 @pytest.mark.parametrize("sort", [True, False])
 @pytest.mark.parametrize("ascending", [True, False])
 @pytest.mark.parametrize("normalize", [True, False])
+@sql_count_checker(query_count=1)
 def test_value_counts_normalize(
     test_data, by, groupby_sort, sort, ascending, normalize
 ):

From b5da1f96e10a70b84bd137566c0aced8d13cad84 Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Thu, 29 Aug 2024 13:15:57 -0700
Subject: [PATCH 17/21] fix as_index and doctests

---
 .../plugin/compiler/snowflake_query_compiler.py | 15 ++++++-------
 .../snowpark/modin/plugin/docstrings/groupby.py | 12 ++++++------
 tests/integ/modin/groupby/test_value_counts.py  | 15 +++++++++++++++
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
index 70cfba44bed..ee2bb2ea23a 100644
--- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
+++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -5063,10 +5063,7 @@ def groupby_value_counts(
             # unlike pandas, we do not include the offending labels in the error message.
             raise ValueError("Keys in subset cannot be in the groupby column keys")
         if subset is not None:
-            if not isinstance(subset, (list, tuple)):
-                subset_list = [subset]
-            else:
-                subset_list = subset
+            subset_list = subset
         else:
             # If subset is unspecified, then all columns should be included.
            subset_list = self._modin_frame.data_column_pandas_labels
@@ -5173,10 +5170,14 @@ def groupby_value_counts(
                 # the count/proportion column. The left-most column (nearest to the grouping columns)
                 # is sorted last.
                 # Exclude the grouping columns (always the first) from the sort.
+                if as_index:
+                    # When as_index is true, the non-grouping columns are part of the index columns
+                    columns_to_filter = result._modin_frame.index_column_pandas_labels
+                else:
+                    # When as_index is false, the non-grouping columns are part of the data columns
+                    columns_to_filter = result._modin_frame.data_column_pandas_labels
                 non_grouping_cols = [
-                    col_label
-                    for col_label in result._modin_frame.index_column_pandas_labels
-                    if col_label not in by
+                    col_label for col_label in columns_to_filter if col_label not in by
                 ]
                 sort_cols.extend(non_grouping_cols)
                 ascending_cols.extend([True] * len(non_grouping_cols))
diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
index 70d8d568493..46fab52f85a 100644
--- a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
+++ b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
@@ -257,8 +257,8 @@ def value_counts():
 
         >>> df.groupby('gender').value_counts() # doctest: +NORMALIZE_WHITESPACE
         gender  education  country
-        female  high       US         1
-                           FR         1
+        female  high       FR         1
+                           US         1
         male    low        FR         2
                            US         1
                 medium     FR         1
@@ -266,8 +266,8 @@ def value_counts():
 
         >>> df.groupby('gender').value_counts(ascending=True) # doctest: +NORMALIZE_WHITESPACE
         gender  education  country
-        female  high       US         1
-                           FR         1
+        female  high       FR         1
+                           US         1
         male    low        US         1
                 medium     FR         1
                 low        FR         2
@@ -275,8 +275,8 @@ def value_counts():
 
        >>> df.groupby('gender').value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE
         gender  education  country
-        female  high       US         0.50
-                           FR         0.50
+        female  high       FR         0.50
+                           US         0.50
         male    low        FR         0.50
                            US         0.25
                 medium     FR         0.25
diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py
index ae4a14ba88b..1f1b2f5c052 100644
--- a/tests/integ/modin/groupby/test_value_counts.py
+++ b/tests/integ/modin/groupby/test_value_counts.py
@@ -131,6 +131,21 @@
+@pytest.mark.parametrize("test_data", TEST_DATA)
+@pytest.mark.parametrize("by", ["by", ["value1", "by"], ["by", "value2"]])
+@pytest.mark.parametrize("groupby_sort", [True, False])
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize("as_index", [True, False])
+@sql_count_checker(query_count=1)
+def test_value_counts_as_index(test_data, by, groupby_sort, sort, as_index):
+    eval_snowpark_pandas_result(
+        *create_test_dfs(test_data),
+        lambda df: df.groupby(by=by, sort=groupby_sort, as_index=as_index).value_counts(
+            sort=sort
+        ),
+    )
+
+
 @pytest.mark.parametrize(
     "subset, exception_cls",

From 0efe409553a93220d1414c65e363d5a6207826e8 Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Thu, 29 Aug 2024 15:19:42 -0700
Subject: [PATCH 18/21] fix as_index doctests

---
 src/snowflake/snowpark/modin/plugin/docstrings/groupby.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
index 46fab52f85a..9b9efbafb0f 100644
--- a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
+++ b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
@@ -284,16 +284,16 @@ def value_counts():
 
         >>> df.groupby('gender', as_index=False).value_counts() # doctest: 
+NORMALIZE_WHITESPACE gender education country count - 0 female high US 1 - 1 female high FR 1 + 0 female high FR 1 + 1 female high US 1 2 male low FR 2 3 male low US 1 4 male medium FR 1 >>> df.groupby('gender', as_index=False).value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE gender education country proportion - 0 female high US 0.50 - 1 female high FR 0.50 + 0 female high FR 0.50 + 1 female high US 0.50 2 male low FR 0.50 3 male low US 0.25 4 male medium FR 0.25 From 76f1fd7c65fac9f17a0ecaf64f04f49208572c57 Mon Sep 17 00:00:00 2001 From: Jonathan Shi <149419494+sfc-gh-joshi@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:00:27 -0700 Subject: [PATCH 19/21] Update docs/source/modin/supported/groupby_supported.rst Co-authored-by: Andong Zhan --- docs/source/modin/supported/groupby_supported.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst index d66d90773df..231f61aa8c5 100644 --- a/docs/source/modin/supported/groupby_supported.rst +++ b/docs/source/modin/supported/groupby_supported.rst @@ -166,7 +166,7 @@ Computations/descriptive stats +-----------------------------+---------------------------------+----------------------------------------------------+ | ``take`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``value_counts`` | P | SeriesGroupBy does not implement ``bins`` | +| ``value_counts`` | P | ``N`` if ``bins`` is given for SeriesGroupBy | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``var`` | P | See ``std`` | +-----------------------------+---------------------------------+----------------------------------------------------+ From ad66e3075f9c1afca3ffed74152d8d6ccc4d5861 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 29 Aug 2024 21:29:45 -0700 Subject: [PATCH 20/21] fix doc spacing --- docs/source/modin/supported/groupby_supported.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst index 231f61aa8c5..3bcf3538216 100644 --- a/docs/source/modin/supported/groupby_supported.rst +++ b/docs/source/modin/supported/groupby_supported.rst @@ -166,7 +166,7 @@ Computations/descriptive stats +-----------------------------+---------------------------------+----------------------------------------------------+ | ``take`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``value_counts`` | P | ``N`` if ``bins`` is given for SeriesGroupBy | +| ``value_counts`` | P | ``N`` if ``bins`` is given for SeriesGroupBy | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``var`` | P | See ``std`` | +-----------------------------+---------------------------------+----------------------------------------------------+ From e48dfe3efbcbce8f78e8c7ce35df7d839f3be87c Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Fri, 30 Aug 2024 13:00:51 -0700 Subject: [PATCH 21/21] update docstring --- .../snowpark/modin/plugin/docstrings/groupby.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py index 
9b9efbafb0f..0692647b3f7 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py @@ -214,6 +214,13 @@ def value_counts(): normalize : bool, default False Return proportions rather than frequencies. + Note that when `normalize=True`, `groupby` is called with `sort=False`, and `value_counts` + is called with `sort=True`, Snowpark pandas will order results differently from + native pandas. This occurs because native pandas sorts on frequencies before converting + them to proportions, while Snowpark pandas computes proportions within groups before sorting. + + See issue for details: https://github.com/pandas-dev/pandas/issues/59307 + sort : bool, default True Sort by frequencies. @@ -2215,6 +2222,13 @@ def value_counts(): normalize : bool, default False Return proportions rather than frequencies. + Note that when `normalize=True`, `groupby` is called with `sort=False`, and `value_counts` + is called with `sort=True`, Snowpark pandas will order results differently from + native pandas. This occurs because native pandas sorts on frequencies before converting + them to proportions, while Snowpark pandas computes proportions within groups before sorting. + + See issue for details: https://github.com/pandas-dev/pandas/issues/59307 + sort : bool, default True Sort by frequencies.
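
As a companion to the note added above, the ordering difference is easiest to see against stock pandas with the example frame from the PATCH 13 query-compiler comment. The snippet below is a minimal sketch (pandas 2.2.x assumed); the `proportions` recomputation merely illustrates the within-group normalization that `_value_counts_groupby(normalize_within_groups=...)` performs, and is not the Snowpark pandas implementation itself.

import pandas as pd

# Example frame from the PATCH 13 comment in snowflake_query_compiler.py.
df = pd.DataFrame({"X": ["B", "A", "A", "B", "B", "B"], "Y": [4, 1, 3, -2, -1, -1]})

# Native pandas sorts on the pre-normalization counts: the count-2 row leads,
# and the four count-1 rows keep their count-based order even though (A, 1)
# and (A, 3) have a proportion of 0.50 within their two-row group.
print(df.groupby("X", sort=False).value_counts(sort=True, normalize=True))

# Recomputing the proportions within each group and then sorting on them
# (the order Snowpark pandas produces) ranks every 0.50 row ahead of the
# 0.25 rows; ties keep their original positions under the stable sort.
counts = df.groupby("X", sort=False).value_counts(sort=False)
within_group_totals = counts.groupby(level="X").transform("sum")
proportions = (counts / within_group_totals).rename("proportion")
print(proportions.sort_values(ascending=False, kind="mergesort"))

Both printouts contain the same five proportions; only the tie order differs, which is why test_value_counts_normalize compares with check_like=True in exactly this groupby(sort=False) / value_counts(sort=True, normalize=True) configuration.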