From 925515c38ae9a2cf84a822894eeb13889a31a386 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 25 Nov 2024 13:51:57 -0800 Subject: [PATCH 1/4] [SNOW-1826257]: Refactor docs to provide one place for supported aggregation functions --- docs/source/modin/supported/agg_supp.rst | 60 +++++++++++++++++++ .../modin/supported/dataframe_supported.rst | 12 +--- .../modin/supported/groupby_supported.rst | 7 +-- .../modin/supported/series_supported.rst | 9 +-- tests/integ/modin/groupby/conftest.py | 2 + 5 files changed, 71 insertions(+), 19 deletions(-) create mode 100644 docs/source/modin/supported/agg_supp.rst diff --git a/docs/source/modin/supported/agg_supp.rst b/docs/source/modin/supported/agg_supp.rst new file mode 100644 index 00000000000..d0d4a7f5963 --- /dev/null +++ b/docs/source/modin/supported/agg_supp.rst @@ -0,0 +1,60 @@ +Supported Aggregation Functions +==================================== + +This page lists which aggregation functions are supported by ``DataFrame.agg``, +``Series.agg``, ``DataFrameGroupBy.agg``, and ``SeriesGroupBy.agg``. +The following table is structured as follows: The first column contains the aggregation function's name. +The second column is a flag for whether or not the aggregation is supported by ``DataFrame.agg``. The +third column is a flag for whether or not the aggregation is supported by ``Series.agg``. The fourth column +is whether or not the aggregation is supported by ``DataFrameGroupBy.agg``. The fifth column is whether or not +the aggregation is supported by ``SeriesGroupBy.agg``. + +.. note:: + ``Y`` stands for yes, i.e., supports distributed implementation, ``N`` stands for no and API simply errors out, + and ``P`` stands for partial (meaning some parameters may not be supported yet). + + Both Python and NumPy functions are supported for ``DataFrameGroupBy.agg`` and ``SeriesGroupBy.agg``. 
+ ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| Aggregation Function | ``DataFrame.agg`` supports? (Y/N/P) | ``Series.agg`` supports? (Y/N/P) | ``DataFrameGroupBy.agg`` supports? (Y/N/P) | ``SeriesGroupBy.agg`` supports? (Y/N/P) | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``count`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``mean`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``min`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``max`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``sum`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. 
| | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``median`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``size`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``std`` | ``P`` for ``axis=0`` - only when | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | +| | ``ddof=0`` or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``var`` | ``P`` for ``axis=0`` - only when | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | +| | ``ddof=0`` or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``quantile`` | ``P`` for ``axis=0`` - only when | ``P`` - only when ``q`` is the | ``P`` - only when ``q`` is the | ``P`` - only when ``q`` is the | +| | ``q`` is the default value or | default value or a scalar. | default value or a scalar. | default value or a scalar. | +| | a scalar. 
| | | | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``len`` | ``N`` | ``N`` | ``Y`` | ``Y`` | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst index 8f139ec5d36..36bc0e0f5d5 100644 --- a/docs/source/modin/supported/dataframe_supported.rst +++ b/docs/source/modin/supported/dataframe_supported.rst @@ -65,15 +65,9 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``add_suffix`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``agg`` | P | ``margins``, ``observed``, | If ``axis == 0``: ``Y`` when function is one of | -| | | ``sort`` | ``count``, ``mean``, ``min``, ``max``, ``sum``, | -| | | | ``median``, ``size``; ``std`` and ``var`` | -| | | | supported with ``ddof=0`` or ``ddof=1``; | -| | | | ``quantile`` is supported when ``q`` is the | -| | | | default value or a scalar. | -| | | | If ``axis == 1``: ``Y`` when function is | -| | | | ``count``, ``min``, ``max``, or ``sum`` and the | -| | | | index is not a MultiIndex. | +| ``agg`` | P | ``margins``, ``observed``, | Check | +| | | ``sort`` | `Supported Aggregation Functions `_ | +| | | | for a list of supported functions. 
| +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``aggregate`` | P | ``margins``, ``observed``, | See ``agg`` | | | | ``sort`` | | diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst index 3aa5c815f2a..bff9b43b647 100644 --- a/docs/source/modin/supported/groupby_supported.rst +++ b/docs/source/modin/supported/groupby_supported.rst @@ -30,10 +30,9 @@ Function application +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | GroupBy method | Snowpark implemented? (Y/N/P/D) | Missing parameters | Notes for current implementation | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``agg`` | P | ``axis`` other than 0 is not | ``Y``, support functions are count, mean, min, max,| -| | | implemented. | sum, median, std, size, len, and var | -| | | | (including both Python and NumPy functions) | -| | | | otherwise ``N``. | +| ``agg`` | P | ``axis`` other than 0 is not | Check | +| | | implemented. | `Supported Aggregation Functions <agg_supp.html>`_ | +| | | | for a list of supported functions. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``aggregate`` | P | ``axis`` other than 0 is not | See ``agg`` | | | | implemented.
| | diff --git a/docs/source/modin/supported/series_supported.rst b/docs/source/modin/supported/series_supported.rst index 6521d9ffd39..3ec6a23dfa3 100644 --- a/docs/source/modin/supported/series_supported.rst +++ b/docs/source/modin/supported/series_supported.rst @@ -76,12 +76,9 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``add_suffix`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``agg`` | P | | ``Y`` when function is one of ``count``, | -| | | | ``mean``, ``min``, ``max``, ``sum``, ``median``, | -| | | | ``size``; ``std`` and ``var`` supported with | -| | | | ``ddof=0`` or ``ddof=1``; ``quantile`` is | -| | | | supported when ``q`` is the default value | -| | | | or a scalar. | +| ``agg`` | P | | Check | +| | | | `Supported Aggregation Functions <agg_supp.html>`_ | +| | | | for a list of supported functions.
| +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``aggregate`` | P | | See ``agg`` | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/tests/integ/modin/groupby/conftest.py b/tests/integ/modin/groupby/conftest.py index 2277794d71a..31aeafddb50 100644 --- a/tests/integ/modin/groupby/conftest.py +++ b/tests/integ/modin/groupby/conftest.py @@ -27,6 +27,8 @@ lambda gr: gr.median(), lambda gr: gr.var(), lambda gr: gr.var(ddof=0), + lambda gr: gr.quantile(), + lambda gr: gr.quantile(q=0.3), ] all_agg_methods = result_compatible_agg_methods + int_to_decimal_float_agg_methods From 2bed3cf1a09d280669985d9656e973b76930011b Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 2 Dec 2024 14:14:12 -0800 Subject: [PATCH 2/4] Add orphan tag --- docs/source/modin/supported/agg_supp.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/modin/supported/agg_supp.rst b/docs/source/modin/supported/agg_supp.rst index d0d4a7f5963..7838f499b29 100644 --- a/docs/source/modin/supported/agg_supp.rst +++ b/docs/source/modin/supported/agg_supp.rst @@ -1,3 +1,5 @@ +:orphan: + Supported Aggregation Functions ==================================== From 21b262641ee97dfd4fa81d5167e4885d44bbf050 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 2 Dec 2024 14:16:45 -0800 Subject: [PATCH 3/4] Address review comments --- docs/source/modin/supported/agg_supp.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/modin/supported/agg_supp.rst b/docs/source/modin/supported/agg_supp.rst index 7838f499b29..5b3a2c174c0 100644 --- a/docs/source/modin/supported/agg_supp.rst +++ b/docs/source/modin/supported/agg_supp.rst @@ -12,10 +12,10 @@ is whether or not the aggregation is supported by ``DataFrameGroupBy.agg``. 
The the aggregation is supported by ``SeriesGroupBy.agg``. .. note:: - ``Y`` stands for yes, i.e., supports distributed implementation, ``N`` stands for no and API simply errors out, + ``Y`` stands for yes (supports distributed implementation), ``N`` stands for no (API simply errors out), and ``P`` stands for partial (meaning some parameters may not be supported yet). - Both Python and NumPy functions are supported for ``DataFrameGroupBy.agg`` and ``SeriesGroupBy.agg``. + Both Python builtin and NumPy functions are supported for ``DataFrameGroupBy.agg`` and ``SeriesGroupBy.agg``. +-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ | Aggregation Function | ``DataFrame.agg`` supports? (Y/N/P) | ``Series.agg`` supports? (Y/N/P) | ``DataFrameGroupBy.agg`` supports? (Y/N/P) | ``SeriesGroupBy.agg`` supports? (Y/N/P) | From 64d9a03fdf258952de0e3a6da76ec393da5685b2 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Thu, 12 Dec 2024 18:55:38 -0800 Subject: [PATCH 4/4] Fix bug --- tests/integ/modin/groupby/conftest.py | 2 - .../modin/groupby/test_groupby_basic_agg.py | 48 +++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/tests/integ/modin/groupby/conftest.py b/tests/integ/modin/groupby/conftest.py index 31aeafddb50..2277794d71a 100644 --- a/tests/integ/modin/groupby/conftest.py +++ b/tests/integ/modin/groupby/conftest.py @@ -27,8 +27,6 @@ lambda gr: gr.median(), lambda gr: gr.var(), lambda gr: gr.var(ddof=0), - lambda gr: gr.quantile(), - lambda gr: gr.quantile(q=0.3), ] all_agg_methods = result_compatible_agg_methods + int_to_decimal_float_agg_methods diff --git a/tests/integ/modin/groupby/test_groupby_basic_agg.py b/tests/integ/modin/groupby/test_groupby_basic_agg.py index 10d1e84c568..ff4636a8bd9 100644 --- a/tests/integ/modin/groupby/test_groupby_basic_agg.py +++ 
b/tests/integ/modin/groupby/test_groupby_basic_agg.py @@ -284,6 +284,54 @@ def test_groupby_agg_with_float_dtypes_named_agg() -> None: ) +@pytest.mark.parametrize( + "grpby_fn", + [ + lambda gr: gr.quantile(), + lambda gr: gr.quantile(q=0.3), + ], +) +@sql_count_checker(query_count=1) +def test_groupby_agg_quantile_with_int_dtypes(grpby_fn) -> None: + native_df = native_pd.DataFrame( + { + "col1_grp": ["g1", "g2", "g0", "g0", "g2", "g3", "g0", "g2", "g3"], + "col2_int64": np.arange(9, dtype="int64") // 3, + "col3_int_identical": [2] * 9, + "col4_int32": np.arange(9, dtype="int32") // 4, + "col5_int16": np.arange(9, dtype="int16") // 3, + "col6_mixed": np.concatenate( + [ + np.arange(3, dtype="int64") // 3, + np.arange(3, dtype="int32") // 3, + np.arange(3, dtype="int16") // 3, + ] + ), + "col7_int_missing": [5, 6, np.nan, 2, 1, np.nan, 5, np.nan, np.nan], + "col8_mixed_missing": np.concatenate( + [ + np.arange(2, dtype="int64") // 3, + [np.nan], + np.arange(2, dtype="int32") // 3, + [np.nan], + np.arange(2, dtype="int16") // 3, + [np.nan], + ] + ), + } + ) + snowpark_pandas_df = pd.DataFrame(native_df) + by = "col1_grp" + snowpark_pandas_groupby = snowpark_pandas_df.groupby(by=by) + pandas_groupby = native_df.groupby(by=by) + eval_snowpark_pandas_result( + snowpark_pandas_groupby, + pandas_groupby, + grpby_fn, + comparator=assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64, + ) + + @sql_count_checker(query_count=2) def test_groupby_agg_with_int_dtypes(int_to_decimal_float_agg_method) -> None: snowpark_pandas_df = pd.DataFrame(