diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index f1687cbc935..d699d1a13e1 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5073,9 +5073,62 @@ def groupby_value_counts( result._modin_frame.data_column_pandas_labels[:-1] + ["proportion" if normalize else "count"] ) - # Within groups, rows are ordered based on their first appearance in the input frame. - # Note that in native pandas, preservation of order on non-grouping columns is not guaranteed: + # pandas currently provides the following behaviors based on the different sort flags. + # These behaviors are not entirely consistent with documentation; see this issue for discussion: # https://github.com/pandas-dev/pandas/issues/59307 + # + # Example data (using pandas 2.2.1 behavior): + # >>> df = pd.DataFrame({"X": ["B", "A", "A", "B", "B", "B"], "Y": [4, 1, 3, -2, -1, -1]}) + # + # 1. groupby(sort=True).value_counts(sort=True) + # Sort on non-grouping columns, then sort on frequencies, then sort on grouping columns. + # >>> df.groupby("X", sort=True).value_counts(sort=True) + # X Y + # A 1 1 + # 3 1 + # B -1 2 + # -2 1 + # 4 1 + # Name: count, dtype: int64 + # + # 2. groupby(sort=True).value_counts(sort=False) + # Sort on non-grouping columns, then sort on grouping columns. + # >>> df.groupby("X", sort=True).value_counts(sort=True) + # X Y + # X Y + # A 1 1 + # 3 1 + # B -2 1 + # -1 2 + # 4 1 + # Name: count, dtype: int64 + # + # 3. groupby(sort=False).value_counts(sort=True) + # Sort on frequencies. + # >>> df.groupby("X", sort=False).value_counts(sort=True) + # X Y + # B -1 2 + # 4 1 + # A 1 1 + # 3 1 + # B -2 1 + # Name: count, dtype: int64 + # + # 4. groupby(sort=False).value_counts(sort=False) + # Sort on nothing (entries match the order of the original frame). + # X Y + # B 4 1 + # A 1 1 + # 3 1 + # B -2 1 + # -1 2 + # Name: count, dtype: int64 + # + # Lastly, when `normalize` is set with groupby(sort=False).value_counts(sort=True, normalize=True), + # pandas will sort by the pre-normalization counts rather than the resulting proportions. As this + # is an uncommon edge case, we cannot handle this using existing QC methods efficiently, so we just + # update our testing code to account for this. + # See comment on issue: https://github.com/pandas-dev/pandas/issues/59307#issuecomment-2313767856 sort_cols = [] if groupby_sort: # When groupby(sort=True), sort the result on the grouping columns @@ -5087,6 +5140,15 @@ def groupby_value_counts( result._modin_frame.data_column_pandas_labels[-1], ) ascending_cols.append(ascending) + if groupby_sort: + # When groupby_sort=True, also sort by the non-grouping columns before sorting by + # the count/proportion column + # Exclude the grouping columns (always the first) from the sort + non_grouping_cols = result._modin_frame.index_column_pandas_labels[ + len(by) : + ] + sort_cols.extend(non_grouping_cols) + ascending_cols.extend([True] * len(non_grouping_cols)) return result.sort_rows_by_column_values( columns=sort_cols, ascending=ascending_cols, diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py index 707080c535e..83fe0eb7041 100644 --- a/tests/integ/modin/groupby/test_value_counts.py +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -37,26 +37,23 @@ @pytest.mark.parametrize("test_data", TEST_DATA) @pytest.mark.parametrize("by", ["by", ["value1", "by"], ["by", "value2"]]) @pytest.mark.parametrize("groupby_sort", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize( "subset", [None, ["value1"], ["value2"], ["value1", "value2"]], ) -@pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("dropna", [True, False]) -def test_value_counts_basic(test_data, by, groupby_sort, subset, normalize, dropna): - # In this test, we use check_like=True because Snowpark pandas will always preserve the original - # order of rows within groups, while native pandas provides this guarantee only for the grouping - # and count/proportion columns. We check this ordering behavior in a separate test. - # - # Note that when as_index=False, check_like is insufficient because columns that were originally - # part of the MultiIndex are demoted to data columns before value_counts returns, so check_like - # doesn't know to reorder by those columns. - # - # See these issues for details: - # https://github.com/pandas-dev/pandas/issues/59307 - # https://github.com/snowflakedb/snowpark-python/pull/1909 - # https://github.com/pandas-dev/pandas/issues/15833 +def test_value_counts_basic( + test_data, by, groupby_sort, sort, ascending, subset, dropna +): by_list = by if isinstance(by, list) else [by] + value_counts_kwargs = { + "sort": sort, + "ascending": ascending, + "subset": subset, + "dropna": dropna, + } if len(set(by_list) & set(subset or [])): # If subset and by overlap, check for ValueError # Unlike pandas, we do not surface label names in the error message @@ -64,9 +61,7 @@ def test_value_counts_basic(test_data, by, groupby_sort, subset, normalize, drop eval_snowpark_pandas_result( *create_test_dfs(test_data), lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( - subset=subset, - normalize=normalize, - dropna=dropna, + **value_counts_kwargs ), expect_exception=True, expect_exception_type=ValueError, @@ -85,327 +80,53 @@ def test_value_counts_basic(test_data, by, groupby_sort, subset, normalize, drop VALUE_COUNTS_TEST_SENTINEL = "zzzzzz" snow_df, native_df = create_test_dfs(test_data) snow_result = snow_df.groupby(by=by, sort=groupby_sort).value_counts( - subset=subset, - normalize=normalize, - dropna=dropna, + **value_counts_kwargs ) native_df = native_df.fillna(value=VALUE_COUNTS_TEST_SENTINEL) native_result = native_df.groupby(by=by, sort=groupby_sort).value_counts( - subset=subset, - normalize=normalize, - dropna=dropna, + **value_counts_kwargs ) native_result.index = native_result.index.map( lambda x: tuple( None if i == VALUE_COUNTS_TEST_SENTINEL else i for i in x ) ) - assert_snowpark_pandas_equal_to_pandas( - snow_result, native_result, check_like=True - ) + assert_snowpark_pandas_equal_to_pandas(snow_result, native_result) else: eval_snowpark_pandas_result( *create_test_dfs(test_data), lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( - subset=subset, - normalize=normalize, - dropna=dropna, + **value_counts_kwargs ), - check_like=True, ) -@pytest.mark.parametrize( - "test_data, groupby_kwargs, value_counts_kwargs, expected_result", - [ - # value_counts(sort) - ( - TEST_DATA[0], - { - "by": ["by"], - "as_index": True, - "sort": True, - }, - { - "sort": True, - "ascending": False, - }, - # In pandas, the row [c, dd, 2] appears before [c, ee, 1] despite the latter - # appearing earlier in the original frame - # - # by value1 value2 - # a aa 1 2 - # bb 3 1 - # b aa 2 1 - # bb 1 1 - # cc 3 1 - # c ee 1 1 - # dd 2 1 - # Name: count, dtype: int64 - native_pd.Series( - [2, 1, 1, 1, 1, 1, 1], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("a", "aa", 1), - ("a", "bb", 3), - ("b", "aa", 2), - ("b", "bb", 1), - ("b", "cc", 3), - ("c", "ee", 1), - ("c", "dd", 2), - ], - names=["by", "value1", "value2"], - ), - ), - ), - ( - TEST_DATA[0], - { - "by": ["by"], - "as_index": True, - "sort": True, - }, - { - "sort": False, - "ascending": False, - }, - # by value1 value2 - # a bb 3 1 - # aa 1 2 - # b aa 2 1 - # bb 1 1 - # cc 3 1 - # c ee 1 1 - # dd 2 1 - # Name: count, dtype: int64 - native_pd.Series( - [1, 2, 1, 1, 1, 1, 1], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("a", "bb", 3), - ("a", "aa", 1), - ("b", "aa", 2), - ("b", "bb", 1), - ("b", "cc", 3), - ("c", "ee", 1), - ("c", "dd", 2), - ], - names=["by", "value1", "value2"], - ), - ), - ), - # groupby(as_index) - ( - TEST_DATA[0], - { - "by": ["by"], - "as_index": False, - "sort": True, - }, - { - "sort": True, - "ascending": False, - }, - # by value1 value2 count - # 0 a aa 1 2 - # 1 a bb 3 1 - # 2 b aa 2 1 - # 3 b bb 1 1 - # 4 b cc 3 1 - # 5 c ee 1 1 - # 6 c dd 2 1 - native_pd.DataFrame( - { - "by": ["a", "a", "b", "b", "b", "c", "c"], - "value1": ["aa", "bb", "aa", "bb", "cc", "ee", "dd"], - "value2": [1, 3, 2, 1, 3, 1, 2], - "count": [2, 1, 1, 1, 1, 1, 1], - } - ), - ), - ( - TEST_DATA[0], - { - "by": ["by"], - "as_index": False, - "sort": True, - }, - { - "sort": False, - "ascending": False, - }, - # by value1 value2 count - # 0 a bb 3 1 - # 1 a aa 1 2 - # 2 b aa 2 1 - # 3 b bb 1 1 - # 4 b cc 3 1 - # 5 c ee 1 1 - # 6 c dd 2 1 - native_pd.DataFrame( - { - "by": ["a", "a", "b", "b", "b", "c", "c"], - "value1": ["bb", "aa", "aa", "bb", "cc", "ee", "dd"], - "value2": [3, 1, 2, 1, 3, 1, 2], - "count": [1, 2, 1, 1, 1, 1, 1], - } - ), - ), - # value_counts(ascending) - ( - TEST_DATA[0], - { - "by": ["by"], - "as_index": True, - "sort": True, - }, - { - "sort": True, - "ascending": True, - }, - # by value1 value2 - # a bb 3 1 - # aa 1 2 - # b aa 2 1 - # bb 1 1 - # cc 3 1 - # c ee 1 1 - # dd 2 1 - # Name: count, dtype: int64 - native_pd.Series( - [1, 2, 1, 1, 1, 1, 1], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("a", "bb", 3), - ("a", "aa", 1), - ("b", "aa", 2), - ("b", "bb", 1), - ("b", "cc", 3), - ("c", "ee", 1), - ("c", "dd", 2), - ], - names=["by", "value1", "value2"], - ), - ), - ), - ( - TEST_DATA[2], - { - "by": ["by"], - "as_index": True, - "sort": False, - }, - { - "sort": False, - }, - # by value1 value2 - # male low US 1 - # medium FR 1 - # female high US 1 - # male low FR 2 - # female high FR 1 - # Name: count, dtype: int64 - native_pd.Series( - [1, 1, 1, 2, 1], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("male", "low", "US"), - ("male", "medium", "FR"), - ("female", "high", "US"), - ("male", "low", "FR"), - ("female", "high", "FR"), - ], - names=["by", "value1", "value2"], - ), - ), - ), - ( - TEST_DATA[2], - { - "by": ["by"], - "as_index": True, - "sort": False, - }, - { - "sort": True, - "ascending": True, - }, - # by value1 value2 - # male low US 1 - # medium FR 1 - # female high US 1 - # FR 1 - # male low FR 2 - # Name: count, dtype: int64 - native_pd.Series( - [1, 1, 1, 1, 2], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("male", "low", "US"), - ("male", "medium", "FR"), - ("female", "high", "US"), - ("female", "high", "FR"), - ("male", "low", "FR"), - ], - names=["by", "value1", "value2"], - ), - ), - ), - ( - TEST_DATA[2], - { - "by": ["by"], - "as_index": True, - "sort": True, - }, - { - "sort": True, - "ascending": True, - }, - # by value1 value2 - # female high US 1 - # FR 1 - # male low US 1 - # medium FR 1 - # low FR 2 - # Name: count, dtype: int64 - native_pd.Series( - [1, 1, 1, 1, 2], - name="count", - index=pd.MultiIndex.from_tuples( - [ - ("female", "high", "US"), - ("female", "high", "FR"), - ("male", "low", "US"), - ("male", "medium", "FR"), - ("male", "low", "FR"), - ], - names=["by", "value1", "value2"], - ), - ), - ), - ], -) -@sql_count_checker(query_count=1) -def test_value_counts_pandas_issue_59307( - test_data, - groupby_kwargs, - value_counts_kwargs, - expected_result, +@pytest.mark.parametrize("test_data", TEST_DATA) +@pytest.mark.parametrize("by", ["by", ["value1", "by"], ["by", "value2"]]) +@pytest.mark.parametrize("groupby_sort", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_value_counts_normalize( + test_data, by, groupby_sort, sort, ascending, normalize ): - # Tests that Snowpark pandas preserves the order of rows from the input frame within groups. - # When groupby(sort=True), native pandas guarantees order is preserved only for the grouping columns. - # https://github.com/pandas-dev/pandas/issues/59307 - assert_snowpark_pandas_equal_to_pandas( - pd.DataFrame(test_data) - .groupby(**groupby_kwargs) - .value_counts(**value_counts_kwargs), - expected_result, + value_counts_kwargs = { + "sort": sort, + "ascending": ascending, + "normalize": normalize, + } + # When normalize is set, pandas will (counter-intuitively) sort by the pre-normalization + # counts rather than the result proportions. This only matters if groupby_sort is False + # and sort is True. + # We work around this by using check_like=True + # See https://github.com/pandas-dev/pandas/issues/59307#issuecomment-2313767856 + check_like = not groupby_sort and sort and normalize + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( + **value_counts_kwargs + ), + check_like=check_like, )