Merge branch 'main' into aalam-SNOW-1644950-add-explicit-option-for-use-logical-type
snowflakedb · Aug 30, 2024 · 0db93f4 · 0db93f4
2 parents 4db27a2 + 20837fc
commit 0db93f4
Show file tree

Hide file tree

Showing 12 changed files with 680 additions and 42 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -63,6 +63,7 @@
   - support for lazy `TimedeltaIndex`.
   - support for `pd.to_timedelta`.
   - support for `GroupBy` aggregations `min`, `max`, `mean`, `idxmax`, `idxmin`, `std`, `sum`, `median`, `count`, `any`, `all`, `size`, `nunique`.
+  - support for `TimedeltaIndex` attributes: `days`, `seconds`, `microseconds` and `nanoseconds`.
 - Added support for index's arithmetic and comparison operators.
 - Added support for `Series.dt.round`.
 - Added documentation pages for `DatetimeIndex`.
@@ -76,6 +77,7 @@
 - Added support for `Index.is_boolean`, `Index.is_integer`, `Index.is_floating`, `Index.is_numeric`, and `Index.is_object`.
 - Added support for `DatetimeIndex.round`, `DatetimeIndex.floor` and `DatetimeIndex.ceil`.
 - Added support for `Series.dt.days_in_month` and `Series.dt.daysinmonth`.
+- Added support for `DataFrameGroupBy.value_counts` and `SeriesGroupBy.value_counts`.
 
 #### Improvements
 

diff --git a/docs/source/modin/groupby.rst b/docs/source/modin/groupby.rst
@@ -59,6 +59,7 @@ GroupBy
     DataFrameGroupBy.std
     DataFrameGroupBy.sum
     DataFrameGroupBy.tail
+    DataFrameGroupBy.value_counts
     DataFrameGroupBy.var
 
 .. rubric:: `SeriesGroupBy` computations / descriptive stats
@@ -90,4 +91,5 @@ GroupBy
     SeriesGroupBy.std
     SeriesGroupBy.sum
     SeriesGroupBy.tail
+    SeriesGroupBy.value_counts
     SeriesGroupBy.var
diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst
@@ -166,7 +166,7 @@ Computations/descriptive stats
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``take``                    | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``value_counts``            | N                               |                                                    |
+| ``value_counts``            | P                               | ``N`` if ``bins`` is given for SeriesGroupBy       |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``var``                     | P                               | See ``std``                                        |
 +-----------------------------+---------------------------------+----------------------------------------------------+

diff --git a/docs/source/modin/supported/timedelta_index_supported.rst b/docs/source/modin/supported/timedelta_index_supported.rst
@@ -15,13 +15,13 @@ Attributes
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | TimedeltaIndex attribute    | Snowpark implemented? (Y/N/P/D) | Notes for current implementation                   |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``days``                    | N                               |                                                    |
+| ``days``                    | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``seconds``                 | N                               |                                                    |
+| ``seconds``                 | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``microseconds``            | N                               |                                                    |
+| ``microseconds``            | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``nanoseconds``             | N                               |                                                    |
+| ``nanoseconds``             | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``components``              | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+

diff --git a/src/snowflake/snowpark/modin/pandas/groupby.py b/src/snowflake/snowpark/modin/pandas/groupby.py
@@ -49,6 +49,7 @@
     create_groupby_transform_func,
 )
 from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta
+from snowflake.snowpark.modin.plugin._internal.utils import INDEX_LABEL
 from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import (
     SnowflakeQueryCompiler,
 )
@@ -188,13 +189,28 @@ def sem(self, ddof=1):
 
     def value_counts(
         self,
-        subset=None,
+        subset: Optional[list[str]] = None,
         normalize: bool = False,
         sort: bool = True,
         ascending: bool = False,
         dropna: bool = True,
     ):
-        ErrorMessage.method_not_implemented_error(name="value_counts", class_="GroupBy")
+        query_compiler = self._query_compiler.groupby_value_counts(
+            by=self._by,
+            axis=self._axis,
+            groupby_kwargs=self._kwargs,
+            subset=subset,
+            normalize=normalize,
+            sort=sort,
+            ascending=ascending,
+            dropna=dropna,
+        )
+        if self._as_index:
+            return pd.Series(
+                query_compiler=query_compiler,
+                name="proportion" if normalize else "count",
+            )
+        return pd.DataFrame(query_compiler=query_compiler)
 
     def mean(
         self,
@@ -1314,6 +1330,47 @@ def get_group(self, name, obj=None):
             name="get_group", class_="SeriesGroupBy"
         )
 
+    def value_counts(
+        self,
+        subset: Optional[list[str]] = None,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        bins: Optional[int] = None,
+        dropna: bool = True,
+    ):
+        # TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.SeriesGroupBy functions
+        # Modin upstream defaults to pandas for this method, so we need to either override this or
+        # rewrite this logic to be friendlier to other backends.
+        #
+        # Unlike DataFrameGroupBy, SeriesGroupBy has an additional `bins` parameter.
+        qc = self._query_compiler
+        # The "by" list becomes the new index, which we then perform the group by on. We call
+        # reset_index to let the query compiler treat it as a data column so it can be grouped on.
+        if self._by is not None:
+            qc = (
+                qc.set_index_from_series(pd.Series(self._by)._query_compiler)
+                .set_index_names([INDEX_LABEL])
+                .reset_index()
+            )
+        result_qc = qc.groupby_value_counts(
+            by=[INDEX_LABEL],
+            axis=self._axis,
+            groupby_kwargs=self._kwargs,
+            subset=subset,
+            normalize=normalize,
+            sort=sort,
+            ascending=ascending,
+            bins=bins,
+            dropna=dropna,
+        )
+        # Reset the names in the MultiIndex
+        result_qc = result_qc.set_index_names([None] * result_qc.nlevels())
+        return pd.Series(
+            query_compiler=result_qc,
+            name="proportion" if normalize else "count",
+        )
+
 
 def validate_groupby_args(
     by: Any,

diff --git a/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py
@@ -21,9 +21,9 @@
     cast,
     convert_timezone,
     date_part,
-    floor,
     iff,
     to_decimal,
+    trunc,
 )
 from snowflake.snowpark.modin.plugin._internal.utils import pandas_lit
 from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage
@@ -176,7 +176,7 @@ def col_to_timedelta(col: Column, unit: str) -> Column:
     if not td_unit:
         # Same error as native pandas.
         raise ValueError(f"invalid unit abbreviation: {unit}")
-    return cast(floor(col * TIMEDELTA_UNIT_MULTIPLIER[td_unit]), LongType())
+    return trunc(col * TIMEDELTA_UNIT_MULTIPLIER[td_unit])
 
 
 PANDAS_DATETIME_FORMAT_TO_SNOWFLAKE_MAPPING = {