Skip to content

Commit

Permalink
SNOW-1458133 Implement Index.value_counts (#1902)
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-vbudati authored Jul 26, 2024
1 parent a30c6ca commit c88209f
Show file tree
Hide file tree
Showing 6 changed files with 291 additions and 25 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
- Added support for `Series.dt.microsecond` and `Series.dt.nanosecond`.
- Added support for `Index.is_unique` and `Index.has_duplicates`.
- Added support for `Index.equals`.
- Added support for `Index.value_counts`.

#### Improvements
- Removed the public preview warning message upon importing Snowpark pandas.
Expand Down
5 changes: 4 additions & 1 deletion docs/source/modin/supported/index_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ The following table is structured as follows: The first column contains the meth
The second column is a flag for whether or not there is an implementation in Snowpark for
the method in the left column.

Currently, there is no lazy MultiIndex support; the lazy Index object represents only a single-level Index.
However, existing Snowpark pandas DataFrame and Series APIs may still accept native pandas MultiIndex objects.

.. note::
``Y`` stands for yes, i.e., supports distributed implementation, ``N`` stands for no and API simply errors out,
``P`` stands for partial (meaning some parameters may not be supported yet), and ``D`` stands for defaults to single
Expand Down Expand Up @@ -124,7 +127,7 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``nunique`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``value_counts`` | N | | |
| ``value_counts`` | P | ``bins`` | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``set_names`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import uuid
from collections.abc import Hashable, Iterable, Mapping, Sequence
from datetime import timedelta, tzinfo
from typing import Any, Callable, List, Literal, Optional, Union, get_args
from typing import Any, Callable, List, Literal, Optional, Tuple, Union, get_args

import numpy as np
import numpy.typing as npt
Expand Down Expand Up @@ -10659,6 +10659,45 @@ def resample(

return SnowflakeQueryCompiler(frame)

def value_counts_index(
    self,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    bins: Optional[int] = None,
    dropna: bool = True,
) -> "SnowflakeQueryCompiler":
    """
    Count the occurrences (or relative frequencies) of the unique values
    of an Index SnowflakeQueryCompiler.

    By default the result is ordered so that the most frequent value
    comes first, and NA values are excluded.

    Args:
        normalize : bool, default False
            When True, report relative frequencies of the unique values
            instead of raw counts.
        sort : bool, default True
            When True, order by frequency; when False, preserve the order
            of the data.
        ascending : bool, default False
            Sort counts from smallest to largest.
        bins : int, optional
            Group values into half-open bins (a convenience for
            ``pd.cut``, numeric data only) instead of counting them.
            This argument is not supported yet.
        dropna : bool, default True
            When True, omit counts of NaN.
    """
    # Binning would require a pd.cut-style step that is not implemented yet.
    if bins is not None:
        raise ErrorMessage.not_implemented("bins argument is not yet supported")

    # Lazy MultiIndex is unsupported; this helper assumes a single-level index.
    assert (
        not self.is_multiindex()
    ), "value_counts_index only supports single index objects"

    # Delegate to the shared groupby-based implementation, grouping on the
    # index column label(s) of the underlying frame.
    index_labels = self._modin_frame.index_column_pandas_labels
    return self._value_counts_groupby(
        index_labels, normalize, sort, ascending, dropna
    )

def value_counts(
self,
subset: Optional[Sequence[Hashable]] = None,
Expand All @@ -10669,10 +10708,10 @@ def value_counts(
dropna: bool = True,
) -> "SnowflakeQueryCompiler":
"""
Counts the number of unique values (frequency) of SnowflakeQueryCompiler.
Counts the frequency or number of unique values of SnowflakeQueryCompiler.

The resulting object will be in descending order so that the
first element is the most frequently-occurring element.
first element is the most frequently occurring element.
Excludes NA values by default.

Args:
Expand Down Expand Up @@ -10703,6 +10742,37 @@ def value_counts(
else:
by = self._modin_frame.data_column_pandas_labels

return self._value_counts_groupby(by, normalize, sort, ascending, dropna)

def _value_counts_groupby(
self,
by: Union[List[Hashable], Tuple[Hashable, ...]],
normalize: bool,
sort: bool,
ascending: bool,
dropna: bool,
) -> "SnowflakeQueryCompiler":
"""
Helper method to obtain the frequency or number of unique values
within a group.

The resulting object will be in descending order so that the
first element is the most frequently occurring element.
Excludes NA values by default.

Args:
by : list
Columns to perform value_counts on.
normalize : bool
If True then the object returned will contain the relative
frequencies of the unique values.
sort : bool
Sort by frequencies when True. Preserve the order of the data when False.
ascending : bool
Sort in ascending order.
dropna : bool
Don't include counts of NaN.
"""
# validate whether by is valid (e.g., contains duplicates or non-existing labels)
self.validate_groupby(by=by, axis=0, level=None)

Expand Down
29 changes: 15 additions & 14 deletions src/snowflake/snowpark/modin/plugin/extensions/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1451,21 +1451,20 @@ def value_counts(
normalize: bool = False,
sort: bool = True,
ascending: bool = False,
bins: Any = None,
bins: int | None = None,
dropna: bool = True,
) -> native_pd.Series:
# how to change the above return type to modin pandas series?
) -> Series:
"""
Return a Series containing counts of unique values.
The resulting object will be in descending order so that the
first element is the most frequently-occurring element.
first element is the most frequently occurring element.
Excludes NA values by default.
Parameters
----------
normalize : bool, default False
If True then the object returned will contain the relative
If True, then the object returned will contain the relative
frequencies of the unique values.
sort : bool, default True
Sort by frequencies when True. Preserve the order of the data when False.
Expand All @@ -1474,13 +1473,14 @@ def value_counts(
bins : int, optional
Rather than count values, group them into half-open bins,
a convenience for ``pd.cut``, only works with numeric data.
`bins` is not yet supported.
dropna : bool, default True
Don't include counts of NaN.
Returns
-------
Series
A series containing counts of unique values.
A Series containing counts of unique values.
See Also
--------
Expand Down Expand Up @@ -1516,14 +1516,15 @@ def value_counts(
apparitions of values, divide the index in the specified
number of half-open bins.
"""
# TODO: SNOW-1458133 implement value_counts
WarningMessage.index_to_pandas_warning("value_counts")
return self.to_pandas().value_counts(
normalize=normalize,
sort=sort,
ascending=ascending,
bins=bins,
dropna=dropna,
return Series(
query_compiler=self._query_compiler.value_counts_index(
normalize=normalize,
sort=sort,
ascending=ascending,
bins=bins,
dropna=dropna,
).set_index_names([self.name]),
name="proportion" if normalize else "count",
)

@is_lazy_check
Expand Down
7 changes: 0 additions & 7 deletions tests/integ/modin/index/test_index_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,6 @@ def test_df_index_equals(native_df):
assert snow_df.index.equals(native_df.index)


@sql_count_checker(query_count=1)
@pytest.mark.parametrize("native_index", NATIVE_INDEX_TEST_DATA)
def test_index_value_counts(native_index):
snow_index = pd.Index(native_index)
assert_series_equal(snow_index.value_counts(), native_index.value_counts())


@sql_count_checker(query_count=8)
def test_index_union():
idx1 = pd.Index([1, 2, 3, 4])
Expand Down
Loading

0 comments on commit c88209f

Please sign in to comment.