Skip to content

Commit

Permalink
Merge branch 'main' into aalam-SNOW-1644950-add-explicit-option-for-u…
Browse files Browse the repository at this point in the history
…se-logical-type
  • Loading branch information
sfc-gh-aalam authored Aug 30, 2024
2 parents 4db27a2 + 20837fc commit 0db93f4
Show file tree
Hide file tree
Showing 12 changed files with 680 additions and 42 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
- support for lazy `TimedeltaIndex`.
- support for `pd.to_timedelta`.
- support for `GroupBy` aggregations `min`, `max`, `mean`, `idxmax`, `idxmin`, `std`, `sum`, `median`, `count`, `any`, `all`, `size`, `nunique`.
- support for `TimedeltaIndex` attributes: `days`, `seconds`, `microseconds` and `nanoseconds`.
- Added support for index's arithmetic and comparison operators.
- Added support for `Series.dt.round`.
- Added documentation pages for `DatetimeIndex`.
Expand All @@ -76,6 +77,7 @@
- Added support for `Index.is_boolean`, `Index.is_integer`, `Index.is_floating`, `Index.is_numeric`, and `Index.is_object`.
- Added support for `DatetimeIndex.round`, `DatetimeIndex.floor` and `DatetimeIndex.ceil`.
- Added support for `Series.dt.days_in_month` and `Series.dt.daysinmonth`.
- Added support for `DataFrameGroupBy.value_counts` and `SeriesGroupBy.value_counts`.

#### Improvements

Expand Down
2 changes: 2 additions & 0 deletions docs/source/modin/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ GroupBy
DataFrameGroupBy.std
DataFrameGroupBy.sum
DataFrameGroupBy.tail
DataFrameGroupBy.value_counts
DataFrameGroupBy.var

.. rubric:: `SeriesGroupBy` computations / descriptive stats
Expand Down Expand Up @@ -90,4 +91,5 @@ GroupBy
SeriesGroupBy.std
SeriesGroupBy.sum
SeriesGroupBy.tail
SeriesGroupBy.value_counts
SeriesGroupBy.var
2 changes: 1 addition & 1 deletion docs/source/modin/supported/groupby_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ Computations/descriptive stats
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``take`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``value_counts`` | N | |
| ``value_counts`` | P | ``N`` if ``bins`` is given for SeriesGroupBy |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``var`` | P | See ``std`` |
+-----------------------------+---------------------------------+----------------------------------------------------+
Expand Down
8 changes: 4 additions & 4 deletions docs/source/modin/supported/timedelta_index_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@ Attributes
+-----------------------------+---------------------------------+----------------------------------------------------+
| TimedeltaIndex attribute | Snowpark implemented? (Y/N/P/D) | Notes for current implementation |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``days`` | N | |
| ``days`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``seconds`` | N | |
| ``seconds`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``microseconds`` | N | |
| ``microseconds`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``nanoseconds`` | N | |
| ``nanoseconds`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``components`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
Expand Down
61 changes: 59 additions & 2 deletions src/snowflake/snowpark/modin/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
create_groupby_transform_func,
)
from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta
from snowflake.snowpark.modin.plugin._internal.utils import INDEX_LABEL
from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import (
SnowflakeQueryCompiler,
)
Expand Down Expand Up @@ -188,13 +189,28 @@ def sem(self, ddof=1):

def value_counts(
self,
subset=None,
subset: Optional[list[str]] = None,
normalize: bool = False,
sort: bool = True,
ascending: bool = False,
dropna: bool = True,
):
# NOTE(review): this hunk is rendered without +/- markers, so it shows two `subset`
# parameter lines above and a not-implemented error call directly below, next to a
# full implementation. Presumably the untyped `subset=None` line and the
# `ErrorMessage.method_not_implemented_error` call are the removed side of the diff —
# confirm against the raw patch before reading this as live code.
ErrorMessage.method_not_implemented_error(name="value_counts", class_="GroupBy")
# Delegate the computation to the query compiler, forwarding the groupby configuration
# stored on this object (`_by`, `_axis`, `_kwargs`) together with the caller's options.
query_compiler = self._query_compiler.groupby_value_counts(
by=self._by,
axis=self._axis,
groupby_kwargs=self._kwargs,
subset=subset,
normalize=normalize,
sort=sort,
ascending=ascending,
dropna=dropna,
)
# When `_as_index` is truthy the result is wrapped in a Series whose name reflects the
# kind of value it holds: "proportion" if `normalize` is set, "count" otherwise.
if self._as_index:
return pd.Series(
query_compiler=query_compiler,
name="proportion" if normalize else "count",
)
# Otherwise the result is wrapped in a DataFrame.
return pd.DataFrame(query_compiler=query_compiler)

def mean(
self,
Expand Down Expand Up @@ -1314,6 +1330,47 @@ def get_group(self, name, obj=None):
name="get_group", class_="SeriesGroupBy"
)

def value_counts(
    self,
    subset: Optional[list[str]] = None,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    bins: Optional[int] = None,
    dropna: bool = True,
):
    # TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.SeriesGroupBy functions
    # Upstream Modin falls back to pandas for this method, so it has to be overridden here
    # (or the upstream logic rewritten to be friendlier to other backends).
    #
    # Compared with the DataFrameGroupBy variant, this one also accepts `bins`, which is
    # forwarded to the query compiler unchanged.
    compiler = self._query_compiler

    # Install the "by" values as the index under a known label, then reset the index so
    # that they become an ordinary data column the query compiler can group on.
    if self._by is not None:
        by_compiler = pd.Series(self._by)._query_compiler
        compiler = compiler.set_index_from_series(by_compiler)
        compiler = compiler.set_index_names([INDEX_LABEL])
        compiler = compiler.reset_index()

    counts_compiler = compiler.groupby_value_counts(
        by=[INDEX_LABEL],
        axis=self._axis,
        groupby_kwargs=self._kwargs,
        subset=subset,
        normalize=normalize,
        sort=sort,
        ascending=ascending,
        bins=bins,
        dropna=dropna,
    )

    # Clear the name of every level of the resulting MultiIndex.
    unnamed_levels = [None] * counts_compiler.nlevels()
    counts_compiler = counts_compiler.set_index_names(unnamed_levels)

    # The Series is named after the kind of value it carries.
    result_name = "proportion" if normalize else "count"
    return pd.Series(query_compiler=counts_compiler, name=result_name)


def validate_groupby_args(
by: Any,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
cast,
convert_timezone,
date_part,
floor,
iff,
to_decimal,
trunc,
)
from snowflake.snowpark.modin.plugin._internal.utils import pandas_lit
from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage
Expand Down Expand Up @@ -176,7 +176,7 @@ def col_to_timedelta(col: Column, unit: str) -> Column:
if not td_unit:
# Same error as native pandas.
raise ValueError(f"invalid unit abbreviation: {unit}")
return cast(floor(col * TIMEDELTA_UNIT_MULTIPLIER[td_unit]), LongType())
return trunc(col * TIMEDELTA_UNIT_MULTIPLIER[td_unit])


PANDAS_DATETIME_FORMAT_TO_SNOWFLAKE_MAPPING = {
Expand Down
Loading

0 comments on commit 0db93f4

Please sign in to comment.