From 665bef1e493048fc662845209336577ec0b3bb6b Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 16 Dec 2024 15:07:09 -0800 Subject: [PATCH] [SNOW-1826257]: Refactor docs to provide one place for supported aggregation functions (#2680) 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-1826257 2. Fill out the following pre-review checklist: - [ ] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. - [ ] I acknowledge that I have ensured my changes to be thread-safe. Follow the link for more information: [Thread-safe Developer Guidelines](https://docs.google.com/document/d/162d_i4zZ2AfcGRXojj0jByt8EUq-DrSHPPnTa4QvwbA/edit#bookmark=id.e82u4nekq80k) 3. Please describe how your code solves the related issue. There are four functions that provide aggregation - DataFrame.agg, Series.agg, DataFrameGroupBy.agg, and SeriesGroupBy.agg. They share similar code paths, so whenever an aggregation function is added, it generally supports more than one of the APIs. We document each API in a different page though, so code authors need to update 3-4 different docs - which can lead to inconsistent docs (in the case that someone forgets to update one or all of the docs). This refactor moves all documentation of supported aggregation functions to one page, which should help keep the docs consistent and correct. 
--- docs/source/modin/supported/agg_supp.rst | 62 +++++++++++++++++++ .../modin/supported/dataframe_supported.rst | 12 +--- .../modin/supported/groupby_supported.rst | 7 +-- .../modin/supported/series_supported.rst | 9 +-- .../modin/groupby/test_groupby_basic_agg.py | 48 ++++++++++++++ 5 files changed, 119 insertions(+), 19 deletions(-) create mode 100644 docs/source/modin/supported/agg_supp.rst diff --git a/docs/source/modin/supported/agg_supp.rst b/docs/source/modin/supported/agg_supp.rst new file mode 100644 index 00000000000..5b3a2c174c0 --- /dev/null +++ b/docs/source/modin/supported/agg_supp.rst @@ -0,0 +1,62 @@ +:orphan: + +Supported Aggregation Functions +==================================== + +This page lists which aggregation functions are supported by ``DataFrame.agg``, +``Series.agg``, ``DataFrameGroupBy.agg``, and ``SeriesGroupBy.agg``. +The following table is structured as follows: The first column contains the aggregation function's name. +The second column is a flag for whether or not the aggregation is supported by ``DataFrame.agg``. The +third column is a flag for whether or not the aggregation is supported by ``Series.agg``. The fourth column +is whether or not the aggregation is supported by ``DataFrameGroupBy.agg``. The fifth column is whether or not +the aggregation is supported by ``SeriesGroupBy.agg``. + +.. note:: + ``Y`` stands for yes (supports distributed implementation), ``N`` stands for no (API simply errors out), + and ``P`` stands for partial (meaning some parameters may not be supported yet). + + Both Python builtin and NumPy functions are supported for ``DataFrameGroupBy.agg`` and ``SeriesGroupBy.agg``. + ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| Aggregation Function | ``DataFrame.agg`` supports? (Y/N/P) | ``Series.agg`` supports? (Y/N/P) | ``DataFrameGroupBy.agg`` supports? 
(Y/N/P) | ``SeriesGroupBy.agg`` supports? (Y/N/P) | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``count`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``mean`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``min`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``max`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``sum`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | For ``axis=1``, ``Y`` if index is | | | | +| | not a MultiIndex. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``median`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | ``N`` for ``axis=1``. 
| | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``size`` | ``Y`` for ``axis=0``. | ``Y`` | ``Y`` | ``Y`` | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``std`` | ``P`` for ``axis=0`` - only when | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | +| | ``ddof=0`` or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``var`` | ``P`` for ``axis=0`` - only when | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | ``P`` - only when ``ddof=0`` | +| | ``ddof=0`` or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | or ``ddof=1``. | +| | ``N`` for ``axis=1``. | | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``quantile`` | ``P`` for ``axis=0`` - only when | ``P`` - only when ``q`` is the | ``P`` - only when ``q`` is the | ``P`` - only when ``q`` is the | +| | ``q`` is the default value or | default value or a scalar. | default value or a scalar. | default value or a scalar. | +| | a scalar. | | | | +| | ``N`` for ``axis=1``. 
| | | | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ +| ``len`` | ``N`` | ``N`` | ``Y`` | ``Y`` | ++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+ diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst index 1ceee7507a6..f86e0f0e772 100644 --- a/docs/source/modin/supported/dataframe_supported.rst +++ b/docs/source/modin/supported/dataframe_supported.rst @@ -65,15 +65,9 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``add_suffix`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``agg`` | P | ``margins``, ``observed``, | If ``axis == 0``: ``Y`` when function is one of | -| | | ``sort`` | ``count``, ``mean``, ``min``, ``max``, ``sum``, | -| | | | ``median``, ``size``; ``std`` and ``var`` | -| | | | supported with ``ddof=0`` or ``ddof=1``; | -| | | | ``quantile`` is supported when ``q`` is the | -| | | | default value or a scalar. | -| | | | If ``axis == 1``: ``Y`` when function is | -| | | | ``count``, ``min``, ``max``, or ``sum`` and the | -| | | | index is not a MultiIndex. | +| ``agg`` | P | ``margins``, ``observed``, | Check | +| | | ``sort`` | `Supported Aggregation Functions <agg_supp.html>`_ | +| | | | for a list of supported functions.
| +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``aggregate`` | P | ``margins``, ``observed``, | See ``agg`` | | | | ``sort`` | | diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst index 695301bcc1e..c3a3711ed77 100644 --- a/docs/source/modin/supported/groupby_supported.rst +++ b/docs/source/modin/supported/groupby_supported.rst @@ -30,10 +30,9 @@ Function application +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | GroupBy method | Snowpark implemented? (Y/N/P/D) | Missing parameters | Notes for current implementation | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``agg`` | P | ``axis`` other than 0 is not | ``Y``, support functions are count, mean, min, max,| -| | | implemented. | sum, median, std, size, len, and var | -| | | | (including both Python and NumPy functions) | -| | | | otherwise ``N``. | +| ``agg`` | P | ``axis`` other than 0 is not | Check | +| | | implemented. | `Supported Aggregation Functions <agg_supp.html>`_ | +| | | | for a list of supported functions. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``aggregate`` | P | ``axis`` other than 0 is not | See ``agg`` | | | | implemented.
| | diff --git a/docs/source/modin/supported/series_supported.rst b/docs/source/modin/supported/series_supported.rst index 6521d9ffd39..3ec6a23dfa3 100644 --- a/docs/source/modin/supported/series_supported.rst +++ b/docs/source/modin/supported/series_supported.rst @@ -76,12 +76,9 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``add_suffix`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``agg`` | P | | ``Y`` when function is one of ``count``, | -| | | | ``mean``, ``min``, ``max``, ``sum``, ``median``, | -| | | | ``size``; ``std`` and ``var`` supported with | -| | | | ``ddof=0`` or ``ddof=1``; ``quantile`` is | -| | | | supported when ``q`` is the default value | -| | | | or a scalar. | +| ``agg`` | P | | Check | +| | | | `Supported Aggregation Functions <agg_supp.html>`_ | +| | | | for a list of supported functions.
| +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``aggregate`` | P | | See ``agg`` | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/tests/integ/modin/groupby/test_groupby_basic_agg.py b/tests/integ/modin/groupby/test_groupby_basic_agg.py index 10d1e84c568..ff4636a8bd9 100644 --- a/tests/integ/modin/groupby/test_groupby_basic_agg.py +++ b/tests/integ/modin/groupby/test_groupby_basic_agg.py @@ -284,6 +284,54 @@ def test_groupby_agg_with_float_dtypes_named_agg() -> None: ) +@pytest.mark.parametrize( + "grpby_fn", + [ + lambda gr: gr.quantile(), + lambda gr: gr.quantile(q=0.3), + ], +) +@sql_count_checker(query_count=1) +def test_groupby_agg_quantile_with_int_dtypes(grpby_fn) -> None: + native_df = native_pd.DataFrame( + { + "col1_grp": ["g1", "g2", "g0", "g0", "g2", "g3", "g0", "g2", "g3"], + "col2_int64": np.arange(9, dtype="int64") // 3, + "col3_int_identical": [2] * 9, + "col4_int32": np.arange(9, dtype="int32") // 4, + "col5_int16": np.arange(9, dtype="int16") // 3, + "col6_mixed": np.concatenate( + [ + np.arange(3, dtype="int64") // 3, + np.arange(3, dtype="int32") // 3, + np.arange(3, dtype="int16") // 3, + ] + ), + "col7_int_missing": [5, 6, np.nan, 2, 1, np.nan, 5, np.nan, np.nan], + "col8_mixed_missing": np.concatenate( + [ + np.arange(2, dtype="int64") // 3, + [np.nan], + np.arange(2, dtype="int32") // 3, + [np.nan], + np.arange(2, dtype="int16") // 3, + [np.nan], + ] + ), + } + ) + snowpark_pandas_df = pd.DataFrame(native_df) + by = "col1_grp" + snowpark_pandas_groupby = snowpark_pandas_df.groupby(by=by) + pandas_groupby = native_df.groupby(by=by) + eval_snowpark_pandas_result( + snowpark_pandas_groupby, + pandas_groupby, + grpby_fn, + comparator=assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64, + ) + + 
@sql_count_checker(query_count=2) def test_groupby_agg_with_int_dtypes(int_to_decimal_float_agg_method) -> None: snowpark_pandas_df = pd.DataFrame(