Skip to content

Commit

Permalink
SNOW-1458133 Implement Index.value_counts (#1902)
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-vbudati authored Jul 26, 2024
1 parent a30c6ca commit c88209f
Show file tree
Hide file tree
Showing 6 changed files with 291 additions and 25 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
- Added support for `Series.dt.microsecond` and `Series.dt.nanosecond`.
- Added support for `Index.is_unique` and `Index.has_duplicates`.
- Added support for `Index.equals`.
- Added support for `Index.value_counts`.

#### Improvements
- Removed the public preview warning message upon importing Snowpark pandas.
Expand Down
5 changes: 4 additions & 1 deletion docs/source/modin/supported/index_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ The following table is structured as follows: The first column contains the meth
The second column is a flag for whether or not there is an implementation in Snowpark for
the method in the left column.

Currently, there is no lazy MultiIndex support; the lazy Index object represents only a single-level Index.
However, existing Snowpark pandas DataFrame and Series APIs may still accept native pandas MultiIndex objects.

.. note::
``Y`` stands for yes, i.e., supports distributed implementation, ``N`` stands for no and API simply errors out,
``P`` stands for partial (meaning some parameters may not be supported yet), and ``D`` stands for defaults to single
Expand Down Expand Up @@ -124,7 +127,7 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``nunique`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``value_counts`` | N | | |
| ``value_counts`` | P | ``bins`` | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``set_names`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import uuid
from collections.abc import Hashable, Iterable, Mapping, Sequence
from datetime import timedelta, tzinfo
from typing import Any, Callable, List, Literal, Optional, Union, get_args
from typing import Any, Callable, List, Literal, Optional, Tuple, Union, get_args

import numpy as np
import numpy.typing as npt
Expand Down Expand Up @@ -10659,6 +10659,45 @@ def resample(

return SnowflakeQueryCompiler(frame)

def value_counts_index(
    self,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    bins: Optional[int] = None,
    dropna: bool = True,
) -> "SnowflakeQueryCompiler":
    """
    Count the occurrences (or relative frequencies) of the unique values
    of an Index SnowflakeQueryCompiler.

    By default the result is ordered so that the most frequent value
    comes first, and NA values are excluded.

    Args:
        normalize : bool, default False
            When True, report relative frequencies of the unique values
            instead of raw counts.
        sort : bool, default True
            When True, order by frequency; when False, preserve the order
            of the data.
        ascending : bool, default False
            Sort counts from smallest to largest.
        bins : int, optional
            Group values into half-open bins (a convenience for
            ``pd.cut``, numeric data only) instead of counting them.
            This argument is not supported yet.
        dropna : bool, default True
            When True, omit counts of NaN.
    """
    # Binning would require a pd.cut-style step that is not implemented yet.
    if bins is not None:
        raise ErrorMessage.not_implemented("bins argument is not yet supported")

    # Lazy MultiIndex is unsupported; this helper assumes a single-level index.
    assert (
        not self.is_multiindex()
    ), "value_counts_index only supports single index objects"

    # Delegate to the shared groupby-based implementation, grouping on the
    # index column label(s) of the underlying frame.
    index_labels = self._modin_frame.index_column_pandas_labels
    return self._value_counts_groupby(
        index_labels, normalize, sort, ascending, dropna
    )

def value_counts(
self,
subset: Optional[Sequence[Hashable]] = None,
Expand All @@ -10669,10 +10708,10 @@ def value_counts(
dropna: bool = True,
) -> "SnowflakeQueryCompiler":
"""
Counts the number of unique values (frequency) of SnowflakeQueryCompiler.
Counts the frequency or number of unique values of SnowflakeQueryCompiler.

The resulting object will be in descending order so that the
first element is the most frequently-occurring element.
first element is the most frequently occurring element.
Excludes NA values by default.

Args:
Expand Down Expand Up @@ -10703,6 +10742,37 @@ def value_counts(
else:
by = self._modin_frame.data_column_pandas_labels

return self._value_counts_groupby(by, normalize, sort, ascending, dropna)

def _value_counts_groupby(
self,
by: Union[List[Hashable], Tuple[Hashable, ...]],
normalize: bool,
sort: bool,
ascending: bool,
dropna: bool,
) -> "SnowflakeQueryCompiler":
"""
Helper method to obtain the frequency or number of unique values
within a group.

The resulting object will be in descending order so that the
first element is the most frequently occurring element.
Excludes NA values by default.

Args:
by : list
Columns to perform value_counts on.
normalize : bool
If True then the object returned will contain the relative
frequencies of the unique values.
sort : bool
Sort by frequencies when True. Preserve the order of the data when False.
ascending : bool
Sort in ascending order.
dropna : bool
Don't include counts of NaN.
"""
# validate whether by is valid (e.g., contains duplicates or non-existing labels)
self.validate_groupby(by=by, axis=0, level=None)

Expand Down
29 changes: 15 additions & 14 deletions src/snowflake/snowpark/modin/plugin/extensions/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1451,21 +1451,20 @@ def value_counts(
normalize: bool = False,
sort: bool = True,
ascending: bool = False,
bins: Any = None,
bins: int | None = None,
dropna: bool = True,
) -> native_pd.Series:
# how to change the above return type to modin pandas series?
) -> Series:
"""
Return a Series containing counts of unique values.
The resulting object will be in descending order so that the
first element is the most frequently-occurring element.
first element is the most frequently occurring element.
Excludes NA values by default.
Parameters
----------
normalize : bool, default False
If True then the object returned will contain the relative
If True, then the object returned will contain the relative
frequencies of the unique values.
sort : bool, default True
Sort by frequencies when True. Preserve the order of the data when False.
Expand All @@ -1474,13 +1473,14 @@ def value_counts(
bins : int, optional
Rather than count values, group them into half-open bins,
a convenience for ``pd.cut``, only works with numeric data.
`bins` is not yet supported.
dropna : bool, default True
Don't include counts of NaN.
Returns
-------
Series
A series containing counts of unique values.
A Series containing counts of unique values.
See Also
--------
Expand Down Expand Up @@ -1516,14 +1516,15 @@ def value_counts(
apparitions of values, divide the index in the specified
number of half-open bins.
"""
# TODO: SNOW-1458133 implement value_counts
WarningMessage.index_to_pandas_warning("value_counts")
return self.to_pandas().value_counts(
normalize=normalize,
sort=sort,
ascending=ascending,
bins=bins,
dropna=dropna,
return Series(
query_compiler=self._query_compiler.value_counts_index(
normalize=normalize,
sort=sort,
ascending=ascending,
bins=bins,
dropna=dropna,
).set_index_names([self.name]),
name="proportion" if normalize else "count",
)

@is_lazy_check
Expand Down
7 changes: 0 additions & 7 deletions tests/integ/modin/index/test_index_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,6 @@ def test_df_index_equals(native_df):
assert snow_df.index.equals(native_df.index)


@sql_count_checker(query_count=1)
@pytest.mark.parametrize("native_index", NATIVE_INDEX_TEST_DATA)
def test_index_value_counts(native_index):
snow_index = pd.Index(native_index)
assert_series_equal(snow_index.value_counts(), native_index.value_counts())


@sql_count_checker(query_count=8)
def test_index_union():
idx1 = pd.Index([1, 2, 3, 4])
Expand Down
Loading

0 comments on commit c88209f

Please sign in to comment.