From 19538ef8a979dccd016c68e56059eeb35a8ff570 Mon Sep 17 00:00:00 2001 From: Naresh Kumar <113932371+sfc-gh-nkumar@users.noreply.github.com> Date: Fri, 30 Aug 2024 13:44:44 -0700 Subject: [PATCH 1/7] SNOW-1637945: Add support for TimedeltaIndex attributes (#2193) Fixes SNOW-1637945 Add support for TimedeltaIndex attributes `days`, `seconds`, `microseconds`, and `nanoseconds`. --- CHANGELOG.md | 1 + .../supported/timedelta_index_supported.rst | 8 +-- .../modin/plugin/_internal/timestamp_utils.py | 4 +- .../compiler/snowflake_query_compiler.py | 49 +++++++++++++++ .../plugin/extensions/timedelta_index.py | 60 ++++++++++++------- .../index/test_timedelta_index_methods.py | 17 +++++- 6 files changed, 107 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0767d5d3a0a..7baea604bd9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,7 @@ - support for lazy `TimedeltaIndex`. - support for `pd.to_timedelta`. - support for `GroupBy` aggregations `min`, `max`, `mean`, `idxmax`, `idxmin`, `std`, `sum`, `median`, `count`, `any`, `all`, `size`, `nunique`. + - support for `TimedeltaIndex` attributes: `days`, `seconds`, `microseconds` and `nanoseconds`. - Added support for index's arithmetic and comparison operators. - Added support for `Series.dt.round`. - Added documentation pages for `DatetimeIndex`. diff --git a/docs/source/modin/supported/timedelta_index_supported.rst b/docs/source/modin/supported/timedelta_index_supported.rst index 73abe530fd7..cd5e64b8c98 100644 --- a/docs/source/modin/supported/timedelta_index_supported.rst +++ b/docs/source/modin/supported/timedelta_index_supported.rst @@ -15,13 +15,13 @@ Attributes +-----------------------------+---------------------------------+----------------------------------------------------+ | TimedeltaIndex attribute | Snowpark implemented? (Y/N/P/D) | Notes for current implementation | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``days`` | N | | +| ``days`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``seconds`` | N | | +| ``seconds`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``microseconds`` | N | | +| ``microseconds`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``nanoseconds`` | N | | +| ``nanoseconds`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``components`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py index 380fe965b4d..c4873724789 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py @@ -21,9 +21,9 @@ cast, convert_timezone, date_part, - floor, iff, to_decimal, + trunc, ) from snowflake.snowpark.modin.plugin._internal.utils import pandas_lit from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage @@ -176,7 +176,7 @@ def col_to_timedelta(col: Column, unit: str) -> Column: if not td_unit: # Same error as native pandas. 
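The `timedelta_property` helper added to `snowflake_query_compiler.py` further down in this patch derives each attribute from the column's underlying integer-nanosecond value. Roughly the following plain-pandas arithmetic (a sketch only, assuming non-negative timedeltas; the names below are illustrative and not part of the patch):

```
# Sketch (not part of the patch): the same decomposition as property_to_func_map,
# written against a plain integer nanosecond value. Assumes a non-negative timedelta.
import pandas as pd

SECONDS_PER_DAY = 86_400
NANOSECONDS_PER_SECOND = 10**9
NANOSECONDS_PER_MICROSECOND = 10**3
MICROSECONDS_PER_SECOND = 10**6
NANOSECONDS_PER_DAY = SECONDS_PER_DAY * NANOSECONDS_PER_SECOND

def timedelta_components(total_ns: int) -> dict:
    return {
        "days": total_ns // NANOSECONDS_PER_DAY,
        "seconds": (total_ns // NANOSECONDS_PER_SECOND) % SECONDS_PER_DAY,
        "microseconds": (total_ns // NANOSECONDS_PER_MICROSECOND) % MICROSECONDS_PER_SECOND,
        "nanoseconds": total_ns % NANOSECONDS_PER_MICROSECOND,
    }

td = pd.Timedelta("1 days 00:00:03.000008")
assert timedelta_components(td.value) == {
    "days": td.days,                  # 1
    "seconds": td.seconds,            # 3
    "microseconds": td.microseconds,  # 8
    "nanoseconds": td.nanoseconds,    # 0
}
```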
raise ValueError(f"invalid unit abbreviation: {unit}") - return cast(floor(col * TIMEDELTA_UNIT_MULTIPLIER[td_unit]), LongType()) + return trunc(col * TIMEDELTA_UNIT_MULTIPLIER[td_unit]) PANDAS_DATETIME_FORMAT_TO_SNOWFLAKE_MAPPING = { diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 50ce5e71310..079f132f372 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -135,6 +135,7 @@ to_variant, translate, trim, + trunc, uniform, upper, when, @@ -382,6 +383,12 @@ SUPPORTED_DT_FLOOR_CEIL_FREQS = ["day", "hour", "minute", "second"] +SECONDS_PER_DAY = 86400 +NANOSECONDS_PER_SECOND = 10**9 +NANOSECONDS_PER_MICROSECOND = 10**3 +MICROSECONDS_PER_SECOND = 10**6 +NANOSECONDS_PER_DAY = SECONDS_PER_DAY * NANOSECONDS_PER_SECOND + class SnowflakeQueryCompiler(BaseQueryCompiler): """based on: https://modin.readthedocs.io/en/0.11.0/flow/modin/backends/base/query_compiler.html @@ -17514,3 +17521,45 @@ def tz_convert(self, *args: Any, **kwargs: Any) -> None: def tz_localize(self, *args: Any, **kwargs: Any) -> None: ErrorMessage.method_not_implemented_error("tz_convert", "BasePandasDataset") + + def timedelta_property( + self, property_name: str, include_index: bool = False + ) -> "SnowflakeQueryCompiler": + """ + Extract a specified component of from Timedelta. + + Parameters + ---------- + property : {'days', 'seconds', 'microseconds', 'nanoseconds'} + The component to extract. + include_index: Whether to include the index columns in the operation. + + Returns + ------- + A new SnowflakeQueryCompiler with the extracted component. + """ + if not include_index: + assert ( + len(self.columns) == 1 + ), "dt only works for series" # pragma: no cover + + # mapping from the property name to the corresponding snowpark function + property_to_func_map = { + "days": lambda column: trunc(column / NANOSECONDS_PER_DAY), + "seconds": lambda column: trunc(column / NANOSECONDS_PER_SECOND) + % SECONDS_PER_DAY, + "microseconds": lambda column: trunc(column / NANOSECONDS_PER_MICROSECOND) + % MICROSECONDS_PER_SECOND, + "nanoseconds": lambda column: column % NANOSECONDS_PER_MICROSECOND, + } + func = property_to_func_map.get(property_name) + if not func: + class_prefix = ( + "TimedeltaIndex" if include_index else "Series.dt" + ) # pragma: no cover + raise ErrorMessage.not_implemented( + f"Snowpark pandas doesn't yet support the property '{class_prefix}.{property_name}'" + ) # pragma: no cover + return SnowflakeQueryCompiler( + self._modin_frame.apply_snowpark_function_to_columns(func, include_index) + ) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/timedelta_index.py b/src/snowflake/snowpark/modin/plugin/extensions/timedelta_index.py index 86ed2a5ded4..dac1a78f740 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/timedelta_index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/timedelta_index.py @@ -130,7 +130,6 @@ def __init__( } self._init_index(data, _CONSTRUCTOR_DEFAULTS, query_compiler, **kwargs) - @timedelta_index_not_implemented() @property def days(self) -> Index: """ @@ -142,15 +141,18 @@ def days(self) -> Index: Examples -------- - >>> idx = pd.to_timedelta(["0 days", "10 days", "20 days"]) # doctest: +SKIP - >>> idx # doctest: +SKIP - TimedeltaIndex(['0 days', '10 days', '20 days'], - dtype='timedelta64[ns]', freq=None) - >>> idx.days # doctest: +SKIP + >>> idx = 
pd.to_timedelta(["0 days", "10 days", "20 days"]) + >>> idx + TimedeltaIndex(['0 days', '10 days', '20 days'], dtype='timedelta64[ns]', freq=None) + >>> idx.days Index([0, 10, 20], dtype='int64') """ + return Index( + query_compiler=self._query_compiler.timedelta_property( + "days", include_index=True + ) + ) - @timedelta_index_not_implemented() @property def seconds(self) -> Index: """ @@ -162,15 +164,18 @@ def seconds(self) -> Index: Examples -------- - >>> idx = pd.to_timedelta([1, 2, 3], unit='s') # doctest: +SKIP - >>> idx # doctest: +SKIP - TimedeltaIndex(['0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03'], - dtype='timedelta64[ns]', freq=None) - >>> idx.seconds # doctest: +SKIP - Index([1, 2, 3], dtype='int32') + >>> idx = pd.to_timedelta([1, 2, 3], unit='s') + >>> idx + TimedeltaIndex(['0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03'], dtype='timedelta64[ns]', freq=None) + >>> idx.seconds + Index([1, 2, 3], dtype='int64') """ + return Index( + query_compiler=self._query_compiler.timedelta_property( + "seconds", include_index=True + ) + ) - @timedelta_index_not_implemented() @property def microseconds(self) -> Index: """ @@ -182,16 +187,20 @@ def microseconds(self) -> Index: Examples -------- - >>> idx = pd.to_timedelta([1, 2, 3], unit='us') # doctest: +SKIP - >>> idx # doctest: +SKIP + >>> idx = pd.to_timedelta([1, 2, 3], unit='us') + >>> idx TimedeltaIndex(['0 days 00:00:00.000001', '0 days 00:00:00.000002', '0 days 00:00:00.000003'], dtype='timedelta64[ns]', freq=None) - >>> idx.microseconds # doctest: +SKIP - Index([1, 2, 3], dtype='int32') + >>> idx.microseconds + Index([1, 2, 3], dtype='int64') """ + return Index( + query_compiler=self._query_compiler.timedelta_property( + "microseconds", include_index=True + ) + ) - @timedelta_index_not_implemented() @property def nanoseconds(self) -> Index: """ @@ -203,14 +212,19 @@ def nanoseconds(self) -> Index: Examples -------- - >>> idx = pd.to_timedelta([1, 2, 3], unit='ns') # doctest: +SKIP - >>> idx # doctest: +SKIP + >>> idx = pd.to_timedelta([1, 2, 3], unit='ns') + >>> idx TimedeltaIndex(['0 days 00:00:00.000000001', '0 days 00:00:00.000000002', '0 days 00:00:00.000000003'], dtype='timedelta64[ns]', freq=None) - >>> idx.nanoseconds # doctest: +SKIP - Index([1, 2, 3], dtype='int32') + >>> idx.nanoseconds + Index([1, 2, 3], dtype='int64') """ + return Index( + query_compiler=self._query_compiler.timedelta_property( + "nanoseconds", include_index=True + ) + ) @timedelta_index_not_implemented() @property diff --git a/tests/integ/modin/index/test_timedelta_index_methods.py b/tests/integ/modin/index/test_timedelta_index_methods.py index 1baafed24d2..646bd5ee983 100644 --- a/tests/integ/modin/index/test_timedelta_index_methods.py +++ b/tests/integ/modin/index/test_timedelta_index_methods.py @@ -8,6 +8,7 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.utils import assert_index_equal @sql_count_checker(query_count=3) @@ -54,12 +55,22 @@ def test_non_default_args(kwargs): pd.TimedeltaIndex(query_compiler=idx._query_compiler, **kwargs) -@pytest.mark.parametrize( - "property", ["days", "seconds", "microseconds", "nanoseconds", "inferred_freq"] -) +@pytest.mark.parametrize("property", ["components", "inferred_freq"]) @sql_count_checker(query_count=0) def test_property_not_implemented(property): snow_index = pd.TimedeltaIndex(["1 days", "2 days"]) msg = f"Snowpark pandas does not yet support the property TimedeltaIndex.{property}" with 
pytest.raises(NotImplementedError, match=msg): getattr(snow_index, property) + + +@pytest.mark.parametrize("attr", ["days", "seconds", "microseconds", "nanoseconds"]) +@sql_count_checker(query_count=1) +def test_timedelta_index_properties(attr): + native_index = native_pd.TimedeltaIndex( + ["1d", "1h", "60s", "1s", "800ms", "5us", "6ns", "1d 3s", "9m 15s 8us", None] + ) + snow_index = pd.Index(native_index) + assert_index_equal( + getattr(snow_index, attr), getattr(native_index, attr), exact=False + ) From 20837fc517766f0bc34222f056a5e1948ae7a16b Mon Sep 17 00:00:00 2001 From: Jonathan Shi <149419494+sfc-gh-joshi@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:23:42 -0700 Subject: [PATCH 2/7] SNOW-1489371: Implement GroupBy.value_counts (#1986) 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-1489371 2. Fill out the following pre-review checklist: - [x] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. 3. Please describe how your code solves the related issue. This PR adds support for GroupBy.value_counts, accepting all parameters except `bin`, which we do not support for DataFrame/Series.value_counts. Upstream modin defaults to pandas for both DataFrameGroupBy/SeriesGroupBy.value_counts, so some of these changes should be eventually upstreamed. pandas has different behavior than what might be expected from documentation; this PR tries to align with existing behavior as much as possible. This is documented in this pandas issue: https://github.com/pandas-dev/pandas/issues/59307 1. When `normalize=True`, pandas sorts by the pre-normalization counts, leading to counterintuitive results. This only matters when `groupby` is called with `sort=False` and `value_counts` with `sort=True`. See test cases for an example. 2. pandas does not always respect the original order of data, depending on the configuration of sort flags in `groupby` and the `value_counts` call itself. The behaviors are as follows (copied from query compiler comments): ``` # pandas currently provides the following behaviors based on the different sort flags. # These behaviors are not entirely consistent with documentation; see this issue for discussion: # https://github.com/pandas-dev/pandas/issues/59307 # # Example data (using pandas 2.2.1 behavior): # >>> df = pd.DataFrame({"X": ["B", "A", "A", "B", "B", "B"], "Y": [4, 1, 3, -2, -1, -1]}) # # 1. groupby(sort=True).value_counts(sort=True) # Sort on non-grouping columns, then sort on frequencies, then sort on grouping columns. # >>> df.groupby("X", sort=True).value_counts(sort=True) # X Y # A 1 1 # 3 1 # B -1 2 # -2 1 # 4 1 # Name: count, dtype: int64 # # 2. groupby(sort=True).value_counts(sort=False) # Sort on non-grouping columns, then sort on grouping columns. # >>> df.groupby("X", sort=True).value_counts(sort=True) # X Y # X Y # A 1 1 # 3 1 # B -2 1 # -1 2 # 4 1 # Name: count, dtype: int64 # # 3. groupby(sort=False).value_counts(sort=True) # Sort on frequencies. # >>> df.groupby("X", sort=False).value_counts(sort=True) # X Y # B -1 2 # 4 1 # A 1 1 # 3 1 # B -2 1 # Name: count, dtype: int64 # # 4. 
groupby(sort=False).value_counts(sort=False) # Sort on nothing (entries match the order of the original frame). # X Y # B 4 1 # A 1 1 # 3 1 # B -2 1 # -1 2 # Name: count, dtype: int64 # # Lastly, when `normalize` is set with groupby(sort=False).value_counts(sort=True, normalize=True), # pandas will sort by the pre-normalization counts rather than the resulting proportions. As this # is an uncommon edge case, we cannot handle this using existing QC methods efficiently, so we just # update our testing code to account for this. # See comment ``` --------- Co-authored-by: Andong Zhan --- CHANGELOG.md | 1 + docs/source/modin/groupby.rst | 2 + .../modin/supported/groupby_supported.rst | 2 +- .../snowpark/modin/pandas/groupby.py | 61 +++++- .../compiler/snowflake_query_compiler.py | 179 +++++++++++++++- .../modin/plugin/docstrings/groupby.py | 142 ++++++++++++- .../integ/modin/groupby/test_value_counts.py | 194 ++++++++++++++++++ tests/unit/modin/test_groupby_unsupported.py | 2 - 8 files changed, 573 insertions(+), 10 deletions(-) create mode 100644 tests/integ/modin/groupby/test_value_counts.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7baea604bd9..8aab4250764 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -77,6 +77,7 @@ - Added support for `Index.is_boolean`, `Index.is_integer`, `Index.is_floating`, `Index.is_numeric`, and `Index.is_object`. - Added support for `DatetimeIndex.round`, `DatetimeIndex.floor` and `DatetimeIndex.ceil`. - Added support for `Series.dt.days_in_month` and `Series.dt.daysinmonth`. +- Added support for `DataFrameGroupBy.value_counts` and `SeriesGroupBy.value_counts`. #### Improvements diff --git a/docs/source/modin/groupby.rst b/docs/source/modin/groupby.rst index 97c99ce383d..e27a3bcf547 100644 --- a/docs/source/modin/groupby.rst +++ b/docs/source/modin/groupby.rst @@ -59,6 +59,7 @@ GroupBy DataFrameGroupBy.std DataFrameGroupBy.sum DataFrameGroupBy.tail + DataFrameGroupBy.value_counts DataFrameGroupBy.var .. 
rubric:: `SeriesGroupBy` computations / descriptive stats @@ -90,4 +91,5 @@ GroupBy SeriesGroupBy.std SeriesGroupBy.sum SeriesGroupBy.tail + SeriesGroupBy.value_counts SeriesGroupBy.var diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst index f9ef001af29..3bcf3538216 100644 --- a/docs/source/modin/supported/groupby_supported.rst +++ b/docs/source/modin/supported/groupby_supported.rst @@ -166,7 +166,7 @@ Computations/descriptive stats +-----------------------------+---------------------------------+----------------------------------------------------+ | ``take`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``value_counts`` | N | | +| ``value_counts`` | P | ``N`` if ``bins`` is given for SeriesGroupBy | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``var`` | P | See ``std`` | +-----------------------------+---------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/pandas/groupby.py b/src/snowflake/snowpark/modin/pandas/groupby.py index a373883317a..de89a48331b 100644 --- a/src/snowflake/snowpark/modin/pandas/groupby.py +++ b/src/snowflake/snowpark/modin/pandas/groupby.py @@ -49,6 +49,7 @@ create_groupby_transform_func, ) from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta +from snowflake.snowpark.modin.plugin._internal.utils import INDEX_LABEL from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import ( SnowflakeQueryCompiler, ) @@ -188,13 +189,28 @@ def sem(self, ddof=1): def value_counts( self, - subset=None, + subset: Optional[list[str]] = None, normalize: bool = False, sort: bool = True, ascending: bool = False, dropna: bool = True, ): - ErrorMessage.method_not_implemented_error(name="value_counts", class_="GroupBy") + query_compiler = self._query_compiler.groupby_value_counts( + by=self._by, + axis=self._axis, + groupby_kwargs=self._kwargs, + subset=subset, + normalize=normalize, + sort=sort, + ascending=ascending, + dropna=dropna, + ) + if self._as_index: + return pd.Series( + query_compiler=query_compiler, + name="proportion" if normalize else "count", + ) + return pd.DataFrame(query_compiler=query_compiler) def mean( self, @@ -1314,6 +1330,47 @@ def get_group(self, name, obj=None): name="get_group", class_="SeriesGroupBy" ) + def value_counts( + self, + subset: Optional[list[str]] = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins: Optional[int] = None, + dropna: bool = True, + ): + # TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.SeriesGroupBy functions + # Modin upstream defaults to pandas for this method, so we need to either override this or + # rewrite this logic to be friendlier to other backends. + # + # Unlike DataFrameGroupBy, SeriesGroupBy has an additional `bins` parameter. + qc = self._query_compiler + # The "by" list becomes the new index, which we then perform the group by on. We call + # reset_index to let the query compiler treat it as a data column so it can be grouped on. 
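The comment above describes the core idea: grouping a Series by an external key list reduces to the DataFrame code path once those keys are materialized as an ordinary column. A rough plain-pandas illustration of that equivalence (a sketch only; the `key`/`value` labels are illustrative, and the two forms differ only in index level names):

```
# Sketch: Series-groupby value_counts vs. the same data routed through a frame,
# with the external "by" keys materialized as a regular column.
import pandas as pd

by = ["a", "a", "b", "b", "a", "c"]
ser = pd.Series([0, 0, None, 1, None, 3])

series_path = ser.groupby(by=by).value_counts()

frame_path = (
    pd.DataFrame({"key": by, "value": ser})
    .groupby("key")["value"]
    .value_counts()
)

# Same counts per (key, value) pair; only the index level names differ.
assert series_path.to_numpy().tolist() == frame_path.to_numpy().tolist()
```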
+ if self._by is not None: + qc = ( + qc.set_index_from_series(pd.Series(self._by)._query_compiler) + .set_index_names([INDEX_LABEL]) + .reset_index() + ) + result_qc = qc.groupby_value_counts( + by=[INDEX_LABEL], + axis=self._axis, + groupby_kwargs=self._kwargs, + subset=subset, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) + # Reset the names in the MultiIndex + result_qc = result_qc.set_index_names([None] * result_qc.nlevels()) + return pd.Series( + query_compiler=result_qc, + name="proportion" if normalize else "count", + ) + def validate_groupby_args( by: Any, diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 079f132f372..a803eb332e7 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -12,7 +12,7 @@ import uuid from collections.abc import Hashable, Iterable, Mapping, Sequence from datetime import timedelta, tzinfo -from typing import Any, Callable, List, Literal, Optional, Tuple, Union, get_args +from typing import Any, Callable, List, Literal, Optional, Union, get_args import numpy as np import numpy.typing as npt @@ -5041,6 +5041,161 @@ def groupby_all( drop=drop, ) + def groupby_value_counts( + self, + by: Any, + axis: int, + groupby_kwargs: dict[str, Any], + subset: Optional[list[str]], + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins: Optional[int] = None, + dropna: bool = True, + ) -> "SnowflakeQueryCompiler": + level = groupby_kwargs.get("level", None) + as_index = groupby_kwargs.get("as_index", True) + groupby_sort = groupby_kwargs.get("sort", True) + is_supported = check_is_groupby_supported_by_snowflake(by, level, axis) + if not is_supported: + ErrorMessage.not_implemented( + f"Snowpark pandas GroupBy.value_counts {_GROUPBY_UNSUPPORTED_GROUPING_MESSAGE}" + ) + if bins is not None: + raise ErrorMessage.not_implemented("bins argument is not yet supported") + if not is_list_like(by): + by = [by] + if len(set(by) & set(subset or [])): + # Check for overlap between by and subset. Since column names may contain customer data, + # unlike pandas, we do not include the offending labels in the error message. + raise ValueError("Keys in subset cannot be in the groupby column keys") + if subset is not None: + subset_list = subset + else: + # If subset is unspecified, then all columns should be included. + subset_list = self._modin_frame.data_column_pandas_labels + # The grouping columns are always included in the subset. + # Furthermore, the columns of the output must have the grouping columns first, in the order + # that they were specified. + subset_list = by + list(filter(lambda label: label not in by, subset_list)) + + if as_index: + # When as_index=True, the result is a Series with a MultiIndex index. + result = self._value_counts_groupby( + by=subset_list, + # Use sort=False to preserve the original order + sort=False, + normalize=normalize, + ascending=False, + dropna=dropna, + normalize_within_groups=by, + ) + else: + # When as_index=False, the result is a DataFrame where count/proportion is appended as a new named column. 
+ result = self._value_counts_groupby( + by=subset_list, + # Use sort=False to preserve the original order + sort=False, + normalize=normalize, + ascending=False, + dropna=dropna, + normalize_within_groups=by, + ).reset_index() + result = result.set_columns( + result._modin_frame.data_column_pandas_labels[:-1] + + ["proportion" if normalize else "count"] + ) + # pandas currently provides the following behaviors based on the different sort flags. + # These behaviors are not entirely consistent with documentation; see this issue for discussion: + # https://github.com/pandas-dev/pandas/issues/59307 + # + # Example data (using pandas 2.2.1 behavior): + # >>> df = pd.DataFrame({"X": ["B", "A", "A", "B", "B", "B"], "Y": [4, 1, 3, -2, -1, -1]}) + # + # 1. groupby(sort=True).value_counts(sort=True) + # Sort on non-grouping columns, then sort on frequencies, then sort on grouping columns. + # >>> df.groupby("X", sort=True).value_counts(sort=True) + # X Y + # A 1 1 + # 3 1 + # B -1 2 + # -2 1 + # 4 1 + # Name: count, dtype: int64 + # + # 2. groupby(sort=True).value_counts(sort=False) + # Sort on non-grouping columns, then sort on grouping columns. + # >>> df.groupby("X", sort=True).value_counts(sort=True) + # X Y + # X Y + # A 1 1 + # 3 1 + # B -2 1 + # -1 2 + # 4 1 + # Name: count, dtype: int64 + # + # 3. groupby(sort=False).value_counts(sort=True) + # Sort on frequencies. + # >>> df.groupby("X", sort=False).value_counts(sort=True) + # X Y + # B -1 2 + # 4 1 + # A 1 1 + # 3 1 + # B -2 1 + # Name: count, dtype: int64 + # + # 4. groupby(sort=False).value_counts(sort=False) + # Sort on nothing (entries match the order of the original frame). + # X Y + # B 4 1 + # A 1 1 + # 3 1 + # B -2 1 + # -1 2 + # Name: count, dtype: int64 + # + # Lastly, when `normalize` is set with groupby(sort=False).value_counts(sort=True, normalize=True), + # pandas will sort by the pre-normalization counts rather than the resulting proportions. As this + # is an uncommon edge case, we cannot handle this using existing QC methods efficiently, so we just + # update our testing code to account for this. + # See comment on issue: https://github.com/pandas-dev/pandas/issues/59307#issuecomment-2313767856 + sort_cols = [] + if groupby_sort: + # When groupby(sort=True), sort the result on the grouping columns + sort_cols = by + ascending_cols = [True] * len(sort_cols) + if sort: + # When sort=True, also sort on the count/proportion column (always the last) + sort_cols.append( + result._modin_frame.data_column_pandas_labels[-1], + ) + ascending_cols.append(ascending) + if groupby_sort: + # When groupby_sort=True, also sort by the non-grouping columns before sorting by + # the count/proportion column. The left-most column (nearest to the grouping columns + # is sorted on last). + # Exclude the grouping columns (always the first) from the sort. 
+ if as_index: + # When as_index is true, the non-grouping columns are part of the index columns + columns_to_filter = result._modin_frame.index_column_pandas_labels + else: + # When as_index is false, the non-grouping columns are part of the data columns + columns_to_filter = result._modin_frame.data_column_pandas_labels + non_grouping_cols = [ + col_label for col_label in columns_to_filter if col_label not in by + ] + sort_cols.extend(non_grouping_cols) + ascending_cols.extend([True] * len(non_grouping_cols)) + return result.sort_rows_by_column_values( + columns=sort_cols, + ascending=ascending_cols, + kind="stable", + na_position="last", + ignore_index=not as_index, # When as_index=False, take the default positional index + ) + def _get_dummies_helper( self, column: Hashable, @@ -11525,11 +11680,13 @@ def value_counts( def _value_counts_groupby( self, - by: Union[List[Hashable], Tuple[Hashable, ...]], + by: Sequence[Hashable], normalize: bool, sort: bool, ascending: bool, dropna: bool, + *, + normalize_within_groups: Optional[list[str]] = None, ) -> "SnowflakeQueryCompiler": """ Helper method to obtain the frequency or number of unique values @@ -11551,6 +11708,10 @@ def _value_counts_groupby( Sort in ascending order. dropna : bool Don't include counts of NaN. + normalize_within_groups : list[str], optional + If set, the normalize parameter will normalize based on the specified groups + rather than the entire dataset. This parameter is exclusive to the Snowpark pandas + query compiler and is only used internally to implement groupby_value_counts. """ self._raise_not_implemented_error_for_timedelta() @@ -11580,9 +11741,21 @@ def _value_counts_groupby( # they are normalized to percentages as [2/(2+1+1), 1/(2+1+1), 1/(2+1+1)] = [0.5, 0.25, 0.25] # by default, ratio_to_report returns a decimal column, whereas pandas returns a float column if normalize: + if normalize_within_groups: + # If normalize_within_groups is set, then the denominator for ratio_to_report should + # be the size of each group instead. + normalize_snowflake_quoted_identifiers = [ + entry[0] + for entry in internal_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( + normalize_within_groups + ) + ] + window = Window.partition_by(normalize_snowflake_quoted_identifiers) + else: + window = None internal_frame = query_compiler._modin_frame.project_columns( [COUNT_LABEL], - builtin("ratio_to_report")(col(count_identifier)).over(), + builtin("ratio_to_report")(col(count_identifier)).over(window), ) count_identifier = internal_frame.data_column_snowflake_quoted_identifiers[ 0 diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py index 05d29f64850..0692647b3f7 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py @@ -203,7 +203,108 @@ def sem(): pass def value_counts(): - pass + """ + Return a Series or DataFrame containing counts of unique rows. + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + + normalize : bool, default False + Return proportions rather than frequencies. + + Note that when `normalize=True`, `groupby` is called with `sort=False`, and `value_counts` + is called with `sort=True`, Snowpark pandas will order results differently from + native pandas. 
This occurs because native pandas sorts on frequencies before converting + them to proportions, while Snowpark pandas computes proportions within groups before sorting. + + See issue for details: https://github.com/pandas-dev/pandas/issues/59307 + + sort : bool, default True + Sort by frequencies. + + ascending : bool, default False + Sort in ascending order. + + dropna : bool, default True + Don't include counts of rows that contain NA values. + + Returns + ------- + :class:`~snowflake.snowpark.modin.pandas.Series` or :class:`~snowflake.snowpark.modin.pandas.DataFrame` + Series if the groupby as_index is True, otherwise DataFrame. + + Notes + ----- + - If the groupby as_index is True then the returned Series will have a MultiIndex with one level per input column. + - If the groupby as_index is False then the returned DataFrame will have an additional column with the value_counts. + The column is labelled 'count' or 'proportion', depending on the normalize parameter. + + By default, rows that contain any NA values are omitted from the result. + + By default, the result will be in descending order so that the first element of each group is the most frequently-occurring row. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... }) + + >>> df # doctest: +NORMALIZE_WHITESPACE + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + >>> df.groupby('gender').value_counts() # doctest: +NORMALIZE_WHITESPACE + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(ascending=True) # doctest: +NORMALIZE_WHITESPACE + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE + gender education country + female high FR 0.50 + US 0.50 + male low FR 0.50 + US 0.25 + medium FR 0.25 + Name: proportion, dtype: float64 + + >>> df.groupby('gender', as_index=False).value_counts() # doctest: +NORMALIZE_WHITESPACE + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE + gender education country proportion + 0 female high FR 0.50 + 1 female high US 0.50 + 2 male low FR 0.50 + 3 male low US 0.25 + 4 male medium FR 0.25 + """ def mean(): """ @@ -2103,8 +2204,45 @@ def size(): """ pass - def unique(self): + def unique(): pass def apply(): pass + + def value_counts(): + """ + Return a Series or DataFrame containing counts of unique rows. + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + + normalize : bool, default False + Return proportions rather than frequencies. + + Note that when `normalize=True`, `groupby` is called with `sort=False`, and `value_counts` + is called with `sort=True`, Snowpark pandas will order results differently from + native pandas. This occurs because native pandas sorts on frequencies before converting + them to proportions, while Snowpark pandas computes proportions within groups before sorting. 
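The normalization note above (and the ratio_to_report window partition earlier in this patch) comes down to dividing each row's count by its own group's total rather than by the grand total. A small plain-pandas sketch of that idea (column names are illustrative, not part of the patch):

```
# Sketch: "normalize within groups" = each count divided by its group's total,
# conceptually ratio_to_report(count) OVER (PARTITION BY grouping columns).
import pandas as pd

df = pd.DataFrame({
    "gender": ["male", "male", "female", "male", "female", "male"],
    "education": ["low", "medium", "high", "low", "high", "low"],
})

counts = df.groupby("gender").value_counts()                    # raw frequencies
group_totals = counts.groupby(level="gender").transform("sum")  # per-group denominator
proportions = counts / group_totals

pd.testing.assert_series_equal(
    proportions,
    df.groupby("gender").value_counts(normalize=True),
    check_names=False,
)
```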
+ + See issue for details: https://github.com/pandas-dev/pandas/issues/59307 + + sort : bool, default True + Sort by frequencies. + + ascending : bool, default False + Sort in ascending order. + + bins : int, optional + Rather than count values, group them into half-open bins, a convenience for `pd.cut`, only works with numeric data. + This parameter is not yet supported in Snowpark pandas. + + dropna : bool, default True + Don't include counts of rows that contain NA values. + + Returns + ------- + :class:`~snowflake.snowpark.modin.pandas.Series` + """ diff --git a/tests/integ/modin/groupby/test_value_counts.py b/tests/integ/modin/groupby/test_value_counts.py new file mode 100644 index 00000000000..1f1b2f5c052 --- /dev/null +++ b/tests/integ/modin/groupby/test_value_counts.py @@ -0,0 +1,194 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import modin.pandas as pd +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.utils import ( + assert_snowpark_pandas_equal_to_pandas, + create_test_dfs, + eval_snowpark_pandas_result, +) + +TEST_DATA = [ + { + "by": ["c", "b", "a", "a", "b", "b", "c", "a"], + "value1": ["ee", "aa", "bb", "aa", "bb", "cc", "dd", "aa"], + "value2": [1, 2, 3, 1, 1, 3, 2, 1], + }, + { + "by": ["key 1", None, None, "key 1", "key 2", "key 1"], + "value1": [None, "value", None, None, None, "value"], + "value2": ["value", None, None, None, "value", None], + }, + # Copied from pandas docs + { + "by": ["male", "male", "female", "male", "female", "male"], + "value1": ["low", "medium", "high", "low", "high", "low"], + "value2": ["US", "FR", "US", "FR", "FR", "FR"], + }, +] + + +@pytest.mark.parametrize("test_data", TEST_DATA) +@pytest.mark.parametrize("by", ["by", ["value1", "by"], ["by", "value2"]]) +@pytest.mark.parametrize("groupby_sort", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize( + "subset", + [None, ["value1"], ["value2"], ["value1", "value2"]], +) +@pytest.mark.parametrize("dropna", [True, False]) +def test_value_counts_basic( + test_data, by, groupby_sort, sort, ascending, subset, dropna +): + by_list = by if isinstance(by, list) else [by] + value_counts_kwargs = { + "sort": sort, + "ascending": ascending, + "subset": subset, + "dropna": dropna, + } + if len(set(by_list) & set(subset or [])): + # If subset and by overlap, check for ValueError + # Unlike pandas, we do not surface label names in the error message + with SqlCounter(query_count=0): + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( + **value_counts_kwargs + ), + expect_exception=True, + expect_exception_type=ValueError, + expect_exception_match="in subset cannot be in the groupby column keys", + assert_exception_equal=False, + ) + return + with SqlCounter(query_count=1): + none_in_by_col = any(None in test_data[col] for col in by_list) + if not dropna and none_in_by_col: + # when dropna is False, pandas gives a different result because it drops all NaN + # keys in the multiindex + # https://github.com/pandas-dev/pandas/issues/56366 + # as a workaround, replace all Nones in the pandas frame with a sentinel value + # since NaNs are sorted last, we want the sentinel to sort to the end as well + VALUE_COUNTS_TEST_SENTINEL = "zzzzzz" + snow_df, native_df = 
create_test_dfs(test_data) + snow_result = snow_df.groupby(by=by, sort=groupby_sort).value_counts( + **value_counts_kwargs + ) + native_df = native_df.fillna(value=VALUE_COUNTS_TEST_SENTINEL) + native_result = native_df.groupby(by=by, sort=groupby_sort).value_counts( + **value_counts_kwargs + ) + native_result.index = native_result.index.map( + lambda x: tuple( + None if i == VALUE_COUNTS_TEST_SENTINEL else i for i in x + ) + ) + assert_snowpark_pandas_equal_to_pandas(snow_result, native_result) + else: + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( + **value_counts_kwargs + ), + ) + + +@pytest.mark.parametrize("test_data", TEST_DATA) +@pytest.mark.parametrize("by", ["by", ["value1", "by"], ["by", "value2"]]) +@pytest.mark.parametrize("groupby_sort", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +@sql_count_checker(query_count=1) +def test_value_counts_normalize( + test_data, by, groupby_sort, sort, ascending, normalize +): + value_counts_kwargs = { + "sort": sort, + "ascending": ascending, + "normalize": normalize, + } + # When normalize is set, pandas will (counter-intuitively) sort by the pre-normalization + # counts rather than the result proportions. This only matters if groupby_sort is False + # and sort is True. + # We work around this by using check_like=True + # See https://github.com/pandas-dev/pandas/issues/59307#issuecomment-2313767856 + check_like = not groupby_sort and sort and normalize + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda df: df.groupby(by=by, sort=groupby_sort).value_counts( + **value_counts_kwargs + ), + check_like=check_like, + ) + + +@pytest.mark.parametrize("test_data", TEST_DATA) +@pytest.mark.parametrize("by", ["by", ["value1", "by"], ["by", "value2"]]) +@pytest.mark.parametrize("groupby_sort", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("as_index", [True, False]) +@sql_count_checker(query_count=1) +def test_value_counts_as_index(test_data, by, groupby_sort, sort, as_index): + eval_snowpark_pandas_result( + *create_test_dfs(test_data), + lambda df: df.groupby(by=by, sort=groupby_sort, as_index=as_index).value_counts( + sort=sort + ), + ) + + +@pytest.mark.parametrize( + "subset, exception_cls", + [ + (["bad_key"], KeyError), # key not in frame + (["by"], ValueError), # subset cannot overlap with grouping columns + (["by", "bad_key"], ValueError), # subset cannot overlap with grouping columns + ], +) +def test_value_counts_bad_subset(subset, exception_cls): + # for KeyError, 1 query always runs to validate the length of the by list + with SqlCounter(query_count=1 if exception_cls is KeyError else 0): + eval_snowpark_pandas_result( + *create_test_dfs(TEST_DATA[0]), + lambda x: x.groupby(by=["by"]).value_counts(subset=subset), + expect_exception=True, + expect_exception_type=exception_cls, + assert_exception_equal=False, + ) + + +# An additional query is needed to validate the length of the by list +# A JOIN is needed to set the index to the by list +@sql_count_checker(query_count=2, join_count=1) +def test_value_counts_series(): + by = ["a", "a", "b", "b", "a", "c"] + native_ser = native_pd.Series( + [0, 0, None, 1, None, 3], + ) + snow_ser = pd.Series(native_ser) + eval_snowpark_pandas_result( + snow_ser, native_ser, lambda ser: ser.groupby(by=by).value_counts() + ) + + +# 1 query always 
runs to validate the length of the by list +@sql_count_checker(query_count=1) +def test_value_counts_bins_unimplemented(): + by = ["a", "a", "b", "b", "a", "c"] + native_ser = native_pd.Series( + [0, 0, None, 1, None, 3], + ) + snow_ser = pd.Series(native_ser) + with pytest.raises(NotImplementedError): + eval_snowpark_pandas_result( + snow_ser, native_ser, lambda ser: ser.groupby(by=by).value_counts(bins=3) + ) diff --git a/tests/unit/modin/test_groupby_unsupported.py b/tests/unit/modin/test_groupby_unsupported.py index efc48724055..6bb27db446f 100644 --- a/tests/unit/modin/test_groupby_unsupported.py +++ b/tests/unit/modin/test_groupby_unsupported.py @@ -39,7 +39,6 @@ (lambda se: se.groupby("A").skew(), "skew"), (lambda se: se.groupby("A").take(2), "take"), (lambda se: se.groupby("A").expanding(), "expanding"), - (lambda se: se.groupby("A").value_counts(), "value_counts"), (lambda se: se.groupby("A").hist(), "hist"), (lambda se: se.groupby("A").plot(), "plot"), (lambda se: se.groupby("A").boxplot("test_group"), "boxplot"), @@ -83,7 +82,6 @@ def test_series_groupby_unsupported_methods_raises( (lambda df: df.groupby("A").skew(), "skew"), (lambda df: df.groupby("A").take(2), "take"), (lambda df: df.groupby("A").expanding(), "expanding"), - (lambda df: df.groupby("A").value_counts(), "value_counts"), (lambda df: df.groupby("A").hist(), "hist"), (lambda df: df.groupby("A").plot(), "plot"), (lambda df: df.groupby("A").boxplot("test_group"), "boxplot"), From 5a3d8e0be17ed869d4d7152a222f06b0b104e33a Mon Sep 17 00:00:00 2001 From: Jonathan Shi <149419494+sfc-gh-joshi@users.noreply.github.com> Date: Fri, 30 Aug 2024 15:17:08 -0700 Subject: [PATCH 3/7] SNOW-1119855: Actually remove base.py (3/2) (#2201) 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-1119855 2. Fill out the following pre-review checklist: - [ ] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. 3. Please describe how your code solves the related issue. Because of some rebasing issues, #2167 did not actually remove base.py as stated, though all references to it were removed. This PR fixes that. --- src/snowflake/snowpark/modin/pandas/base.py | 4201 ------------------- 1 file changed, 4201 deletions(-) delete mode 100644 src/snowflake/snowpark/modin/pandas/base.py diff --git a/src/snowflake/snowpark/modin/pandas/base.py b/src/snowflake/snowpark/modin/pandas/base.py deleted file mode 100644 index 26071049237..00000000000 --- a/src/snowflake/snowpark/modin/pandas/base.py +++ /dev/null @@ -1,4201 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# - -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -# Code in this file may constitute partial or total reimplementation, or modification of -# existing code originally distributed by the Modin project, under the Apache License, -# Version 2.0. - -"""Implement DataFrame/Series public API as pandas does.""" -from __future__ import annotations - -import pickle as pkl -import re -import warnings -from collections.abc import Hashable, Mapping, Sequence -from typing import Any, Callable, Literal, get_args - -import numpy as np -import numpy.typing as npt -import pandas -import pandas.core.generic -import pandas.core.resample -import pandas.core.window.rolling -from pandas._libs import lib -from pandas._libs.lib import NoDefault, is_bool, no_default -from pandas._typing import ( - AggFuncType, - AnyArrayLike, - Axes, - Axis, - CompressionOptions, - DtypeBackend, - FillnaOptions, - IgnoreRaise, - IndexKeyFunc, - IndexLabel, - Level, - NaPosition, - RandomState, - Scalar, - StorageOptions, - TimedeltaConvertibleTypes, - TimestampConvertibleTypes, -) -from pandas.compat import numpy as numpy_compat -from pandas.core.common import apply_if_callable, count_not_none, pipe -from pandas.core.dtypes.common import ( - is_dict_like, - is_dtype_equal, - is_list_like, - is_numeric_dtype, - is_object_dtype, - pandas_dtype, -) -from pandas.core.dtypes.inference import is_integer -from pandas.errors import SpecificationError -from pandas.util._validators import ( - validate_ascending, - validate_bool_kwarg, - validate_percentile, -) - -from snowflake.snowpark.modin import pandas as pd -from snowflake.snowpark.modin.pandas.utils import ( - ensure_index, - extract_validate_and_try_convert_named_aggs_from_kwargs, - get_as_shape_compatible_dataframe_or_series, - is_scalar, - raise_if_native_pandas_objects, - validate_and_try_convert_agg_func_arg_func_to_str, -) -from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta -from snowflake.snowpark.modin.plugin._typing import ListLike -from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage -from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage -from snowflake.snowpark.modin.utils import ( - _inherit_docstrings, - try_cast_to_pandas, - validate_int_kwarg, -) - -# Similar to pandas, sentinel value to use as kwarg in place of None when None has -# special meaning and needs to be distinguished from a user explicitly passing None. 
-sentinel = object() - -# Do not look up certain attributes in columns or index, as they're used for some -# special purposes, like serving remote context -_ATTRS_NO_LOOKUP = { - "____id_pack__", - "__name__", - "_cache", - "_ipython_canary_method_should_not_exist_", - "_ipython_display_", - "_repr_html_", - "_repr_javascript_", - "_repr_jpeg_", - "_repr_json_", - "_repr_latex_", - "_repr_markdown_", - "_repr_mimebundle_", - "_repr_pdf_", - "_repr_png_", - "_repr_svg_", - "__array_struct__", - "__array_interface__", - "_typ", -} - -_DEFAULT_BEHAVIOUR = { - "__init__", - "__class__", - "_get_index", - "_set_index", - "_pandas_class", - "_get_axis_number", - "empty", - "index", - "columns", - "name", - "dtypes", - "dtype", - "groupby", - "_get_name", - "_set_name", - "_default_to_pandas", - "_query_compiler", - "_to_pandas", - "_repartition", - "_build_repr_df", - "_reduce_dimension", - "__repr__", - "__len__", - "__constructor__", - "_create_or_update_from_compiler", - "_update_inplace", - # for persistance support; - # see DataFrame methods docstrings for more - "_inflate_light", - "_inflate_full", - "__reduce__", - "__reduce_ex__", - "_init", -} | _ATTRS_NO_LOOKUP - - -@_inherit_docstrings( - pandas.DataFrame, - apilink=["pandas.DataFrame", "pandas.Series"], - excluded=[ - pandas.DataFrame.between_time, - pandas.Series.between_time, - pandas.DataFrame.flags, - pandas.Series.flags, - pandas.DataFrame.kurt, - pandas.Series.kurt, - pandas.DataFrame.kurtosis, - pandas.Series.kurtosis, - pandas.DataFrame.rank, - pandas.Series.rank, - pandas.DataFrame.to_csv, - pandas.Series.to_csv, - pandas.DataFrame.sum, - ], -) -class BasePandasDataset(metaclass=TelemetryMeta): - """ - Implement most of the common code that exists in DataFrame/Series. - - Since both objects share the same underlying representation, and the algorithms - are the same, we use this object to define the general behavior of those objects - and then use those objects to define the output type. - - TelemetryMeta is a metaclass that automatically add telemetry decorators to classes/instance methods. - See TelemetryMeta for details. Note: Its subclasses will inherit this metaclass. - """ - - # pandas class that we pretend to be; usually it has the same name as our class - # but lives in "pandas" namespace. - _pandas_class = pandas.core.generic.NDFrame - - @pandas.util.cache_readonly - def _is_dataframe(self) -> bool: - """ - Tell whether this is a dataframe. - - Ideally, other methods of BasePandasDataset shouldn't care whether this - is a dataframe or a series, but sometimes we need to know. This method - is better than hasattr(self, "columns"), which for series will call - self.__getattr__("columns"), which requires materializing the index. - - Returns - ------- - bool : Whether this is a dataframe. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return issubclass(self._pandas_class, pandas.DataFrame) - - def _add_sibling(self, sibling): - """ - Add a DataFrame or Series object to the list of siblings. - - Siblings are objects that share the same query compiler. This function is called - when a shallow copy is made. - - Parameters - ---------- - sibling : BasePandasDataset - Dataset to add to siblings list. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - sibling._siblings = self._siblings + [self] - self._siblings += [sibling] - for sib in self._siblings: - sib._siblings += [sibling] - - def _update_inplace(self, new_query_compiler): - """ - Update the current DataFrame inplace. - - Parameters - ---------- - new_query_compiler : query_compiler - The new QueryCompiler to use to manage the data. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - old_query_compiler = self._query_compiler - self._query_compiler = new_query_compiler - for sib in self._siblings: - sib._query_compiler = new_query_compiler - old_query_compiler.free() - - def _validate_other( - self, - other, - axis, - dtype_check=False, - compare_index=False, - ): - """ - Help to check validity of other in inter-df operations. - - Parameters - ---------- - other : modin.pandas.BasePandasDataset - Another dataset to validate against `self`. - axis : {None, 0, 1} - Specifies axis along which to do validation. When `1` or `None` - is specified, validation is done along `index`, if `0` is specified - validation is done along `columns` of `other` frame. - dtype_check : bool, default: False - Validates that both frames have compatible dtypes. - compare_index : bool, default: False - Compare Index if True. - - Returns - ------- - modin.pandas.BasePandasDataset - Other frame if it is determined to be valid. - - Raises - ------ - ValueError - If `other` is `Series` and its length is different from - length of `self` `axis`. - TypeError - If any validation checks fail. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if isinstance(other, BasePandasDataset): - return other._query_compiler - if not is_list_like(other): - # We skip dtype checking if the other is a scalar. Note that pandas - # is_scalar can be misleading as it is False for almost all objects, - # even when those objects should be treated as scalars. See e.g. - # https://github.com/modin-project/modin/issues/5236. Therefore, we - # detect scalars by checking that `other` is neither a list-like nor - # another BasePandasDataset. - return other - axis = self._get_axis_number(axis) if axis is not None else 1 - result = other - if axis == 0: - if len(other) != len(self._query_compiler.index): - raise ValueError( - f"Unable to coerce to Series, length must be {len(self._query_compiler.index)}: " - + f"given {len(other)}" - ) - else: - if len(other) != len(self._query_compiler.columns): - raise ValueError( - f"Unable to coerce to Series, length must be {len(self._query_compiler.columns)}: " - + f"given {len(other)}" - ) - if hasattr(other, "dtype"): - other_dtypes = [other.dtype] * len(other) - elif is_dict_like(other): - other_dtypes = [ - type(other[label]) - for label in self._query_compiler.get_axis(axis) - # The binary operation is applied for intersection of axis labels - # and dictionary keys. So filtering out extra keys. - if label in other - ] - else: - other_dtypes = [type(x) for x in other] - if compare_index: - if not self.index.equals(other.index): - raise TypeError("Cannot perform operation with non-equal index") - # Do dtype checking. - if dtype_check: - self_dtypes = self._get_dtypes() - if is_dict_like(other): - # The binary operation is applied for the intersection of axis labels - # and dictionary keys. So filtering `self_dtypes` to match the `other` - # dictionary. 
- self_dtypes = [ - dtype - for label, dtype in zip( - self._query_compiler.get_axis(axis), self._get_dtypes() - ) - if label in other - ] - - # TODO(https://github.com/modin-project/modin/issues/5239): - # this spuriously rejects other that is a list including some - # custom type that can be added to self's elements. - if not all( - (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype)) - or (is_object_dtype(self_dtype) and is_object_dtype(other_dtype)) - # Check if dtype is timedelta ("m") or datetime ("M") - or ( - lib.is_np_dtype(self_dtype, "mM") - and lib.is_np_dtype(other_dtype, "mM") - ) - or is_dtype_equal(self_dtype, other_dtype) - for self_dtype, other_dtype in zip(self_dtypes, other_dtypes) - ): - raise TypeError("Cannot do operation with improper dtypes") - return result - - def _validate_function(self, func, on_invalid=None): - """ - Check the validity of the function which is intended to be applied to the frame. - - Parameters - ---------- - func : object - on_invalid : callable(str, cls), optional - Function to call in case invalid `func` is met, `on_invalid` takes an error - message and an exception type as arguments. If not specified raise an - appropriate exception. - **Note:** This parameter is a hack to concord with pandas error types. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - - def error_raiser(msg, exception=Exception): - raise exception(msg) - - if on_invalid is None: - on_invalid = error_raiser - - if isinstance(func, dict): - [self._validate_function(fn, on_invalid) for fn in func.values()] - return - # We also could validate this, but it may be quite expensive for lazy-frames - # if not all(idx in self.axes[axis] for idx in func.keys()): - # error_raiser("Invalid dict keys", KeyError) - - if not is_list_like(func): - func = [func] - - for fn in func: - if isinstance(fn, str): - if not (hasattr(self, fn) or hasattr(np, fn)): - on_invalid( - f"{fn} is not valid function for {type(self)} object.", - AttributeError, - ) - elif not callable(fn): - on_invalid( - f"One of the passed functions has an invalid type: {type(fn)}: {fn}, " - + "only callable or string is acceptable.", - TypeError, - ) - - def _binary_op( - self, - op: str, - other: BasePandasDataset, - axis: Axis, - level: Level | None = None, - fill_value: float | None = None, - **kwargs: Any, - ): - """ - Do binary operation between two datasets. - - Parameters - ---------- - op : str - Name of binary operation. - other : modin.pandas.BasePandasDataset - Second operand of binary operation. - axis: Whether to compare by the index (0 or ‘index’) or columns. (1 or ‘columns’). - level: Broadcast across a level, matching Index values on the passed MultiIndex level. - fill_value: Fill existing missing (NaN) values, and any new element needed for - successful DataFrame alignment, with this value before computation. - If data in both corresponding DataFrame locations is missing the result will be missing. - only arithmetic binary operation has this parameter (e.g., add() has, but eq() doesn't have). - - kwargs can contain the following parameters passed in at the frontend: - func: Only used for `combine` method. Function that takes two series as inputs and - return a Series or a scalar. Used to merge the two dataframes column by columns. - - Returns - ------- - modin.pandas.BasePandasDataset - Result of binary operation. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - raise_if_native_pandas_objects(other) - axis = self._get_axis_number(axis) - squeeze_self = isinstance(self, pd.Series) - - # pandas itself will ignore the axis argument when using Series.. - # Per default, it is set to axis=0. However, for the case of a Series interacting with - # a DataFrame the behavior is axis=1. Manually check here for this case and adjust the axis. - - is_lhs_series_and_rhs_dataframe = ( - True - if isinstance(self, pd.Series) and isinstance(other, pd.DataFrame) - else False - ) - - new_query_compiler = self._query_compiler.binary_op( - op=op, - other=other, - axis=1 if is_lhs_series_and_rhs_dataframe else axis, - level=level, - fill_value=fill_value, - squeeze_self=squeeze_self, - **kwargs, - ) - - from snowflake.snowpark.modin.pandas.dataframe import DataFrame - - # Modin Bug: https://github.com/modin-project/modin/issues/7236 - # For a Series interacting with a DataFrame, always return a DataFrame - return ( - DataFrame(query_compiler=new_query_compiler) - if is_lhs_series_and_rhs_dataframe - else self._create_or_update_from_compiler(new_query_compiler) - ) - - def _default_to_pandas(self, op, *args, **kwargs): - """ - Convert dataset to pandas type and call a pandas function on it. - - Parameters - ---------- - op : str - Name of pandas function. - *args : list - Additional positional arguments to be passed to `op`. - **kwargs : dict - Additional keywords arguments to be passed to `op`. - - Returns - ------- - object - Result of operation. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - args = try_cast_to_pandas(args) - kwargs = try_cast_to_pandas(kwargs) - pandas_obj = self._to_pandas() - if callable(op): - result = op(pandas_obj, *args, **kwargs) - elif isinstance(op, str): - # The inner `getattr` is ensuring that we are treating this object (whether - # it is a DataFrame, Series, etc.) as a pandas object. The outer `getattr` - # will get the operation (`op`) from the pandas version of the class and run - # it on the object after we have converted it to pandas. - attr = getattr(self._pandas_class, op) - if isinstance(attr, property): - result = getattr(pandas_obj, op) - else: - result = attr(pandas_obj, *args, **kwargs) - else: - ErrorMessage.internal_error( - failure_condition=True, - extra_log=f"{op} is an unsupported operation", - ) - # SparseDataFrames cannot be serialized by arrow and cause problems for Modin. - # For now we will use pandas. - if isinstance(result, type(self)) and not isinstance( - result, (pandas.SparseDataFrame, pandas.SparseSeries) - ): - return self._create_or_update_from_compiler( - result, inplace=kwargs.get("inplace", False) - ) - elif isinstance(result, pandas.DataFrame): - from snowflake.snowpark.modin.pandas import DataFrame - - return DataFrame(result) - elif isinstance(result, pandas.Series): - from snowflake.snowpark.modin.pandas import Series - - return Series(result) - # inplace - elif result is None: - return self._create_or_update_from_compiler( - getattr(pd, type(pandas_obj).__name__)(pandas_obj)._query_compiler, - inplace=True, - ) - else: - try: - if ( - isinstance(result, (list, tuple)) - and len(result) == 2 - and isinstance(result[0], pandas.DataFrame) - ): - # Some operations split the DataFrame into two (e.g. align). 
We need to wrap - # both of the returned results - if isinstance(result[1], pandas.DataFrame): - second = self.__constructor__(result[1]) - else: - second = result[1] - return self.__constructor__(result[0]), second - else: - return result - except TypeError: - return result - - @classmethod - def _get_axis_number(cls, axis): - """ - Convert axis name or number to axis index. - - Parameters - ---------- - axis : int, str or pandas._libs.lib.NoDefault - Axis name ('index' or 'columns') or number to be converted to axis index. - - Returns - ------- - int - 0 or 1 - axis index in the array of axes stored in the dataframe. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if axis is no_default: - axis = None - - return cls._pandas_class._get_axis_number(axis) if axis is not None else 0 - - @pandas.util.cache_readonly - def __constructor__(self): - """ - Construct DataFrame or Series object depending on self type. - - Returns - ------- - modin.pandas.BasePandasDataset - Constructed object. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return type(self) - - def abs(self): # noqa: RT01, D200 - """ - Return a `BasePandasDataset` with absolute numeric value of each element. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.__constructor__(query_compiler=self._query_compiler.unary_op("abs")) - - def _to_series_list(self, index: pd.Index) -> list[pd.Series]: - """ - Convert index to a list of series - Args: - index: can be single or multi index - - Returns: - the list of series - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if isinstance(index, pd.MultiIndex): - return [ - pd.Series(index.get_level_values(level)) - for level in range(index.nlevels) - ] - elif isinstance(index, pd.Index): - return [pd.Series(index)] - - def _set_index(self, new_index: Axes) -> None: - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - self._update_inplace( - new_query_compiler=self._query_compiler.set_index( - [ - s._query_compiler - for s in self._to_series_list(ensure_index(new_index)) - ] - ) - ) - - def set_axis( - self, - labels: IndexLabel, - *, - axis: Axis = 0, - copy: bool | NoDefault = no_default, - ): - """ - Assign desired index to given axis. - """ - # Behavior based on copy: - # ----------------------------------- - # - In native pandas, copy determines whether to create a copy of the data (not DataFrame). - # - We cannot emulate the native pandas' copy behavior in Snowpark since a copy of only data - # cannot be created -- you can only copy the whole object (DataFrame/Series). - # - # Snowpark behavior: - # ------------------ - # - copy is kept for compatibility with native pandas but is ignored. The user is warned that copy is unused. - # Warn user that copy does not do anything. - if copy is not no_default: - WarningMessage.single_warning( - message=f"{type(self).__name__}.set_axis 'copy' keyword is unused and is ignored." - ) - if labels is None: - raise TypeError("None is not a valid value for the parameter 'labels'.") - - # Determine whether to update self or a copy and perform update. 
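# Illustrative sketch (editorial, toy data): the relabel-on-a-copy behavior the
# comments above describe, in native-pandas-style usage. Assumes the Snowpark
# pandas API mirrors native pandas here, with `copy` accepted but ignored as
# stated above.
import pandas as pd
df = pd.DataFrame({"a": [1, 2]})
relabeled = df.set_axis(["x", "y"], axis=0)  # labels land on a new object
print(list(relabeled.index))                 # ['x', 'y']; df itself is unchanged
# Per the check above, labels=None raises TypeError instead of clearing the axis.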
- obj = self.copy() - setattr(obj, axis, labels) - return obj - - def _get_index(self): - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - from snowflake.snowpark.modin.plugin.extensions.index import Index - - if self._query_compiler.is_multiindex(): - # Lazy multiindex is not supported - return self._query_compiler.index - - idx = Index(query_compiler=self._query_compiler) - idx._set_parent(self) - return idx - - index = property(_get_index, _set_index) - - def add( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Return addition of `BasePandasDataset` and `other`, element-wise (binary operator `add`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "add", other, axis=axis, level=level, fill_value=fill_value - ) - - def aggregate( - self, func: AggFuncType = None, axis: Axis | None = 0, *args: Any, **kwargs: Any - ): - """ - Aggregate using one or more operations over the specified axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - from snowflake.snowpark.modin.pandas import Series - - origin_axis = axis - axis = self._get_axis_number(axis) - - if axis == 1 and isinstance(self, Series): - raise ValueError(f"No axis named {origin_axis} for object type Series") - - if len(self._query_compiler.columns) == 0: - # native pandas raise error with message "no result", here we raise a more readable error. - raise ValueError("No column to aggregate on.") - - # If we are using named kwargs, then we do not clear the kwargs (need them in the QC for processing - # order, as well as formatting error messages.) - uses_named_kwargs = False - # If aggregate is called on a Series, named aggregations can be passed in via a dictionary - # to func. - if func is None or (is_dict_like(func) and not self._is_dataframe): - if axis == 1: - raise ValueError( - "`func` must not be `None` when `axis=1`. Named aggregations are not supported with `axis=1`." - ) - if func is not None: - # If named aggregations are passed in via a dictionary to func, then we - # ignore the kwargs. - if any(is_dict_like(value) for value in func.values()): - # We can only get to this codepath if self is a Series, and func is a dictionary. - # In this case, if any of the values of func are themselves dictionaries, we must raise - # a Specification Error, as that is what pandas does. - raise SpecificationError("nested renamer is not supported") - kwargs = func - func = extract_validate_and_try_convert_named_aggs_from_kwargs( - self, allow_duplication=False, axis=axis, **kwargs - ) - uses_named_kwargs = True - else: - func = validate_and_try_convert_agg_func_arg_func_to_str( - agg_func=func, - obj=self, - allow_duplication=False, - axis=axis, - ) - - # This is to stay consistent with pandas result format, when the func is single - # aggregation function in format of callable or str, reduce the result dimension to - # convert dataframe to series, or convert series to scalar. - # Note: When named aggregations are used, the result is not reduced, even if there - # is only a single function. - # needs_reduce_dimension cannot be True if we are using named aggregations, since - # the values for func in that case are either NamedTuples (AggFuncWithLabels) or - # lists of NamedTuples, both of which are list like. 
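# Illustrative sketch (editorial, toy data) of the dimension-reduction rule
# described above, using native-pandas-style behavior: a single scalar function
# reduces the DataFrame result to a Series, while list-like funcs and the named
# aggregations handled above keep a DataFrame.
import pandas as pd
df = pd.DataFrame({"a": [0, 1], "b": [2, 3]})
print(df.agg("max"))                           # Series: a -> 1, b -> 3
print(df.agg(["max"]))                         # DataFrame with a single 'max' row
print(df.agg(x=("a", "max"), y=("b", "min")))  # named aggregations: not reduced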
- need_reduce_dimension = ( - (callable(func) or isinstance(func, str)) - # A Series should be returned when a single scalar string/function aggregation function, or a - # dict of scalar string/functions is specified. In all other cases (including if the function - # is a 1-element list), the result is a DataFrame. - # - # The examples below have axis=1, but the same logic is applied for axis=0. - # >>> df = pd.DataFrame({"a": [0, 1], "b": [2, 3]}) - # - # single aggregation: return Series - # >>> df.agg("max", axis=1) - # 0 2 - # 1 3 - # dtype: int64 - # - # list of aggregations: return DF - # >>> df.agg(["max"], axis=1) - # max - # 0 2 - # 1 3 - # - # dict where all aggregations are strings: return Series - # >>> df.agg({1: "max", 0: "min"}, axis=1) - # 1 3 - # 0 0 - # dtype: int64 - # - # dict where one element is a list: return DF - # >>> df.agg({1: "max", 0: ["min"]}, axis=1) - # max min - # 1 3.0 NaN - # 0 NaN 0.0 - or ( - is_dict_like(func) - and all(not is_list_like(value) for value in func.values()) - ) - ) - - # If func is a dict, pandas will not respect kwargs for each aggregation function, and - # we should drop them before passing the to the query compiler. - # - # >>> native_pd.DataFrame({"a": [0, 1], "b": [np.nan, 0]}).agg("max", skipna=False, axis=1) - # 0 NaN - # 1 1.0 - # dtype: float64 - # >>> native_pd.DataFrame({"a": [0, 1], "b": [np.nan, 0]}).agg(["max"], skipna=False, axis=1) - # max - # 0 0.0 - # 1 1.0 - # >>> pd.DataFrame([[np.nan], [0]]).aggregate("count", skipna=True, axis=0) - # 0 1 - # dtype: int8 - # >>> pd.DataFrame([[np.nan], [0]]).count(skipna=True, axis=0) - # TypeError: got an unexpected keyword argument 'skipna' - if is_dict_like(func) and not uses_named_kwargs: - kwargs.clear() - - result = self.__constructor__( - query_compiler=self._query_compiler.agg( - func=func, - axis=axis, - args=args, - kwargs=kwargs, - ) - ) - - if need_reduce_dimension: - if self._is_dataframe: - result = Series(query_compiler=result._query_compiler) - - if isinstance(result, Series): - # When func is just "quantile" with a scalar q, result has quantile value as name - q = kwargs.get("q", 0.5) - if func == "quantile" and is_scalar(q): - result.name = q - else: - result.name = None - - # handle case for single scalar (same as result._reduce_dimension()) - if isinstance(self, Series): - return result.to_pandas().squeeze() - - return result - - agg = aggregate - - def _string_function(self, func, *args, **kwargs): - """ - Execute a function identified by its string name. - - Parameters - ---------- - func : str - Function name to call on `self`. - *args : list - Positional arguments to pass to func. - **kwargs : dict - Keyword arguments to pass to func. - - Returns - ------- - object - Function result. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - assert isinstance(func, str) - f = getattr(self, func, None) - if f is not None: - if callable(f): - return f(*args, **kwargs) - assert len(args) == 0 - assert len([kwarg for kwarg in kwargs if kwarg != "axis"]) == 0 - return f - f = getattr(np, func, None) - if f is not None: - return self._default_to_pandas("agg", func, *args, **kwargs) - raise ValueError(f"{func} is an unknown string function") - - def _get_dtypes(self): - """ - Get dtypes as list. - - Returns - ------- - list - Either a one-element list that contains `dtype` if object denotes a Series - or a list that contains `dtypes` if object denotes a DataFrame. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if hasattr(self, "dtype"): - return [self.dtype] - else: - return list(self.dtypes) - - def align( - self, - other, - join="outer", - axis=None, - level=None, - copy=None, - fill_value=None, - method=lib.no_default, - limit=lib.no_default, - fill_axis=lib.no_default, - broadcast_axis=lib.no_default, - ): # pragma: no cover # noqa: PR01, RT01, D200 - """ - Align two objects on their axes with the specified join method. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if ( - method is not lib.no_default - or limit is not lib.no_default - or fill_axis is not lib.no_default - ): - warnings.warn( # noqa: B028 - "The 'method', 'limit', and 'fill_axis' keywords in " - + f"{type(self).__name__}.align are deprecated and will be removed " - + "in a future version. Call fillna directly on the returned objects " - + "instead.", - FutureWarning, - ) - if fill_axis is lib.no_default: - fill_axis = 0 - if method is lib.no_default: - method = None - if limit is lib.no_default: - limit = None - - if broadcast_axis is not lib.no_default: - msg = ( - f"The 'broadcast_axis' keyword in {type(self).__name__}.align is " - + "deprecated and will be removed in a future version." - ) - if broadcast_axis is not None: - if self.ndim == 1 and other.ndim == 2: - msg += ( - " Use left = DataFrame({col: left for col in right.columns}, " - + "index=right.index) before calling `left.align(right)` instead." - ) - elif self.ndim == 2 and other.ndim == 1: - msg += ( - " Use right = DataFrame({col: right for col in left.columns}, " - + "index=left.index) before calling `left.align(right)` instead" - ) - warnings.warn(msg, FutureWarning) # noqa: B028 - else: - broadcast_axis = None - - left, right = self._query_compiler.align( - other._query_compiler, - join=join, - axis=axis, - level=level, - copy=copy, - fill_value=fill_value, - method=method, - limit=limit, - fill_axis=fill_axis, - broadcast_axis=broadcast_axis, - ) - return self.__constructor__(query_compiler=left), self.__constructor__( - query_compiler=right - ) - - def all(self, axis=0, bool_only=None, skipna=True, **kwargs): - """ - Return whether all elements are True, potentially over an axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - validate_bool_kwarg(skipna, "skipna", none_allowed=False) - if axis is not None: - axis = self._get_axis_number(axis) - if bool_only and axis == 0: - if hasattr(self, "dtype"): - ErrorMessage.not_implemented( - "{}.{} does not implement numeric_only.".format( - type(self).__name__, "all" - ) - ) # pragma: no cover - data_for_compute = self[self.columns[self.dtypes == np.bool_]] - return data_for_compute.all( - axis=axis, bool_only=False, skipna=skipna, **kwargs - ) - return self._reduce_dimension( - self._query_compiler.all( - axis=axis, bool_only=bool_only, skipna=skipna, **kwargs - ) - ) - else: - if bool_only: - raise ValueError(f"Axis must be 0 or 1 (got {axis})") - # Reduce to a scalar if axis is None. - result = self._reduce_dimension( - # FIXME: Judging by pandas docs `**kwargs` serves only compatibility - # purpose and does not affect the result, we shouldn't pass them to the query compiler. 
- self._query_compiler.all( - axis=0, - bool_only=bool_only, - skipna=skipna, - **kwargs, - ) - ) - if isinstance(result, BasePandasDataset): - return result.all( - axis=axis, bool_only=bool_only, skipna=skipna, **kwargs - ) - return result - - def any(self, axis=0, bool_only=None, skipna=True, **kwargs): - """ - Return whether any element is True, potentially over an axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - validate_bool_kwarg(skipna, "skipna", none_allowed=False) - if axis is not None: - axis = self._get_axis_number(axis) - if bool_only and axis == 0: - if hasattr(self, "dtype"): - ErrorMessage.not_implemented( - "{}.{} does not implement numeric_only.".format( - type(self).__name__, "all" - ) - ) # pragma: no cover - data_for_compute = self[self.columns[self.dtypes == np.bool_]] - return data_for_compute.any( - axis=axis, bool_only=False, skipna=skipna, **kwargs - ) - return self._reduce_dimension( - self._query_compiler.any( - axis=axis, bool_only=bool_only, skipna=skipna, **kwargs - ) - ) - else: - if bool_only: - raise ValueError(f"Axis must be 0 or 1 (got {axis})") - # Reduce to a scalar if axis is None. - result = self._reduce_dimension( - self._query_compiler.any( - axis=0, - bool_only=bool_only, - skipna=skipna, - **kwargs, - ) - ) - if isinstance(result, BasePandasDataset): - return result.any( - axis=axis, bool_only=bool_only, skipna=skipna, **kwargs - ) - return result - - def apply( - self, - func, - axis, - broadcast, - raw, - reduce, - result_type, - convert_dtype, - args, - **kwds, - ): # noqa: PR01, RT01, D200 - """ - Apply a function along an axis of the `BasePandasDataset`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - - def error_raiser(msg, exception): - """Convert passed exception to the same type as pandas do and raise it.""" - # HACK: to concord with pandas error types by replacing all of the - # TypeErrors to the AssertionErrors - exception = exception if exception is not TypeError else AssertionError - raise exception(msg) - - self._validate_function(func, on_invalid=error_raiser) - axis = self._get_axis_number(axis) - # TODO SNOW-864025: Support str in series.apply and df.apply - if isinstance(func, str): - # if axis != 1 function can be bounded to the Series, which doesn't - # support axis parameter - if axis == 1: - kwds["axis"] = axis - result = self._string_function(func, *args, **kwds) - if isinstance(result, BasePandasDataset): - return result._query_compiler - return result - # TODO SNOW-856682: Support dict in series.apply and df.apply - elif isinstance(func, dict): - if len(self.columns) != len(set(self.columns)): - WarningMessage.mismatch_with_pandas( - operation="apply", - message="Duplicate column names not supported with apply().", - ) # pragma: no cover - query_compiler = self._query_compiler.apply( - func, - axis, - args=args, - raw=raw, - result_type=result_type, - **kwds, - ) - return query_compiler - - def asfreq( - self, - freq: str, - method: FillnaOptions | None = None, - how: str | None = None, - normalize: bool = False, - fill_value: Scalar = None, - ): # noqa: PR01, RT01, D200 - """ - Convert time series to specified frequency. - """ - return self.__constructor__( - query_compiler=self._query_compiler.asfreq( - freq=freq, - method=method, - how=how, - normalize=normalize, - fill_value=fill_value, - ) - ) - - def asof(self, where, subset=None): # noqa: PR01, RT01, D200 - """ - Return the last row(s) without any NaNs before `where`. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - scalar = not is_list_like(where) - if isinstance(where, pandas.Index): - # Prevent accidental mutation of original: - where = where.copy() - else: - if scalar: - where = [where] - where = pandas.Index(where) - - if subset is None: - data = self - else: - # Only relevant for DataFrames: - data = self[subset] - no_na_index = data.dropna().index - new_index = pandas.Index([no_na_index.asof(i) for i in where]) - result = self.reindex(new_index) - result.index = where - - if scalar: - # Need to return a Series: - result = result.squeeze() - return result - - def astype( - self, - dtype: str | type | pd.Series | dict[str, type], - copy: bool = True, - errors: Literal["raise", "ignore"] = "raise", - ) -> pd.DataFrame | pd.Series: - """ - Cast a Modin object to a specified dtype `dtype`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - # dtype can be a series, a dict, or a scalar. If it's series or scalar, - # convert it to a dict before passing it to the query compiler. - raise_if_native_pandas_objects(dtype) - from snowflake.snowpark.modin.pandas import Series - - if isinstance(dtype, Series): - dtype = dtype.to_pandas() - if not dtype.index.is_unique: - raise ValueError( - "The new Series of types must have a unique index, i.e. " - + "it must be one-to-one mapping from column names to " - + " their new dtypes." - ) - dtype = dtype.to_dict() - # If we got a series or dict originally, dtype is a dict now. Its keys - # must be column names. - if isinstance(dtype, dict): - # Avoid materializing columns. The query compiler will handle errors where - # dtype dict includes keys that are not in columns. - col_dtypes = dtype - for col_name in col_dtypes: - if col_name not in self._query_compiler.columns: - raise KeyError( - "Only a column name can be used for the key in a dtype mappings argument. " - f"'{col_name}' not found in columns." - ) - else: - # Assume that the dtype is a scalar. - col_dtypes = {column: dtype for column in self._query_compiler.columns} - - # ensure values are pandas dtypes - col_dtypes = {k: pandas_dtype(v) for k, v in col_dtypes.items()} - new_query_compiler = self._query_compiler.astype(col_dtypes, errors=errors) - return self._create_or_update_from_compiler(new_query_compiler, not copy) - - @property - def at(self, axis=None): # noqa: PR01, RT01, D200 - """ - Get a single value for a row/column label pair. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - from .indexing import _AtIndexer - - return _AtIndexer(self) - - def at_time(self, time, asof=False, axis=None): # noqa: PR01, RT01, D200 - """ - Select values at particular time of day (e.g., 9:30AM). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if asof: # pragma: no cover - # pandas raises NotImplementedError for asof=True, so we do, too. - raise NotImplementedError("'asof' argument is not supported") - return self.between_time( - start_time=time, end_time=time, inclusive="both", axis=axis - ) - - def backfill( - self, - axis: Axis | None = None, - inplace: bool = False, - limit: int | None = None, - downcast: dict | None = None, - ): - """ - Synonym for `DataFrame.fillna` with ``method='bfill'``. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - warnings.warn( - "Series/DataFrame.backfill is deprecated. 
Use Series/DataFrame.bfill instead.", - FutureWarning, - stacklevel=1, - ) - return self.fillna( - method="bfill", axis=axis, limit=limit, downcast=downcast, inplace=inplace - ) - - @_inherit_docstrings( - pandas.DataFrame.between_time, apilink="pandas.DataFrame.between_time" - ) - def between_time( - self: BasePandasDataset, - start_time, - end_time, - inclusive: str | None = None, - axis=None, - ): # noqa: PR01, RT01, D200 - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._create_or_update_from_compiler( - self._query_compiler.between_time( - start_time=pandas.core.tools.times.to_time(start_time), - end_time=pandas.core.tools.times.to_time(end_time), - inclusive=inclusive, - axis=self._get_axis_number(axis), - ) - ) - - def bfill( - self, - axis: Axis | None = None, - inplace: bool = False, - limit: int | None = None, - downcast: dict | None = None, - ): - """ - Synonym for `DataFrame.fillna` with ``method='bfill'``. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.fillna( - method="bfill", axis=axis, limit=limit, downcast=downcast, inplace=inplace - ) - - def bool(self): # noqa: RT01, D200 - """ - Return the bool of a single element `BasePandasDataset`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - shape = self.shape - if shape != (1,) and shape != (1, 1): - raise ValueError( - """The PandasObject does not have exactly - 1 element. Return the bool of a single - element PandasObject. The truth value is - ambiguous. Use a.empty, a.item(), a.any() - or a.all().""" - ) - else: - return self._to_pandas().bool() - - def clip( - self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs - ): # noqa: PR01, RT01, D200 - """ - Trim values at input threshold(s). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - # validate inputs - if axis is not None: - axis = self._get_axis_number(axis) - self._validate_dtypes(numeric_only=True) - inplace = validate_bool_kwarg(inplace, "inplace") - axis = numpy_compat.function.validate_clip_with_axis(axis, args, kwargs) - # any np.nan bounds are treated as None - if lower is not None and np.any(np.isnan(lower)): - lower = None - if upper is not None and np.any(np.isnan(upper)): - upper = None - if is_list_like(lower) or is_list_like(upper): - if axis is None: - raise ValueError("Must specify axis = 0 or 1") - lower = self._validate_other(lower, axis) - upper = self._validate_other(upper, axis) - # FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility - # purpose and does not affect the result, we shouldn't pass them to the query compiler. - new_query_compiler = self._query_compiler.clip( - lower=lower, upper=upper, axis=axis, inplace=inplace, *args, **kwargs - ) - return self._create_or_update_from_compiler(new_query_compiler, inplace) - - def combine(self, other, func, fill_value=None, **kwargs): # noqa: PR01, RT01, D200 - """ - Perform combination of `BasePandasDataset`-s according to `func`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "combine", other, axis=0, func=func, fill_value=fill_value, **kwargs - ) - - def combine_first(self, other): # noqa: PR01, RT01, D200 - """ - Update null elements with value in the same location in `other`. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op("combine_first", other, axis=0) - - def copy(self, deep: bool = True): - """ - Make a copy of the object's metadata. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if deep: - return self.__constructor__(query_compiler=self._query_compiler.copy()) - new_obj = self.__constructor__(query_compiler=self._query_compiler) - self._add_sibling(new_obj) - return new_obj - - def count( - self, - axis: Axis | None = 0, - numeric_only: bool = False, - ): - """ - Count non-NA cells for `BasePandasDataset`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._agg_helper( - func="count", - axis=axis, - numeric_only=numeric_only, - ) - - def cummax(self, axis=None, skipna=True, *args, **kwargs): # noqa: PR01, RT01, D200 - """ - Return cumulative maximum over a `BasePandasDataset` axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - if axis == 1: - self._validate_dtypes(numeric_only=True) - return self.__constructor__( - # FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility - # purpose and does not affect the result, we shouldn't pass them to the query compiler. - query_compiler=self._query_compiler.cummax( - fold_axis=axis, axis=axis, skipna=skipna, **kwargs - ) - ) - - def cummin(self, axis=None, skipna=True, *args, **kwargs): # noqa: PR01, RT01, D200 - """ - Return cumulative minimum over a `BasePandasDataset` axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - if axis == 1: - self._validate_dtypes(numeric_only=True) - return self.__constructor__( - # FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility - # purpose and does not affect the result, we shouldn't pass them to the query compiler. - query_compiler=self._query_compiler.cummin( - fold_axis=axis, axis=axis, skipna=skipna, **kwargs - ) - ) - - def cumprod( - self, axis=None, skipna=True, *args, **kwargs - ): # noqa: PR01, RT01, D200 - """ - Return cumulative product over a `BasePandasDataset` axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - self._validate_dtypes(numeric_only=True) - return self.__constructor__( - # FIXME: Judging by pandas docs `**kwargs` serves only compatibility - # purpose and does not affect the result, we shouldn't pass them to the query compiler. - query_compiler=self._query_compiler.cumprod( - fold_axis=axis, axis=axis, skipna=skipna, **kwargs - ) - ) - - def cumsum(self, axis=None, skipna=True, *args, **kwargs): # noqa: PR01, RT01, D200 - """ - Return cumulative sum over a `BasePandasDataset` axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - self._validate_dtypes(numeric_only=True) - return self.__constructor__( - # FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility - # purpose and does not affect the result, we shouldn't pass them to the query compiler. 
- query_compiler=self._query_compiler.cumsum( - fold_axis=axis, axis=axis, skipna=skipna, **kwargs - ) - ) - - def describe( - self, - percentiles: ListLike | None = None, - include: ListLike | Literal["all"] | None = None, - exclude: ListLike | None = None, - ) -> BasePandasDataset: - """ - Generate descriptive statistics. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - # Upstream modin uses pandas.core.methods.describe._refine_percentiles for this, - # which is not available in pandas 1.5.X - if percentiles is not None: - # explicit conversion of `percentiles` to list - percentiles = list(percentiles) - - # get them all to be in [0, 1] - validate_percentile(percentiles) - - # median should always be included - if 0.5 not in percentiles: - percentiles.append(0.5) - percentiles = np.asarray(percentiles) - else: - percentiles = np.array([0.25, 0.5, 0.75]) - - data = self - if self._is_dataframe: - # Upstream modin lacks this check because it defaults to pandas for describing empty dataframes - if len(self.columns) == 0: - raise ValueError("Cannot describe a DataFrame without columns") - - # include/exclude are ignored for Series - if (include is None) and (exclude is None): - # when some numerics are found, keep only numerics - default_include: list[npt.DTypeLike] = [np.number] - default_include.append("datetime") - data = self.select_dtypes(include=default_include) - if len(data.columns) == 0: - data = self - elif include == "all": - if exclude is not None: - raise ValueError("exclude must be None when include is 'all'") - data = self - else: - data = self.select_dtypes( - include=include, - exclude=exclude, - ) - # Upstream modin uses data.empty, but that incurs an extra row count query - if self._is_dataframe and len(data.columns) == 0: - # Match pandas error from concatenating empty list of series descriptions. - raise ValueError("No objects to concatenate") - - return self.__constructor__( - query_compiler=data._query_compiler.describe(percentiles=percentiles) - ) - - def diff(self, periods: int = 1, axis: Axis = 0): - """ - First discrete difference of element. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - # We must only accept integer (or float values that are whole numbers) - # for periods. - int_periods = validate_int_kwarg(periods, "periods", float_allowed=True) - axis = self._get_axis_number(axis) - return self.__constructor__( - query_compiler=self._query_compiler.diff(axis=axis, periods=int_periods) - ) - - def drop( - self, - labels: IndexLabel = None, - axis: Axis = 0, - index: IndexLabel = None, - columns: IndexLabel = None, - level: Level = None, - inplace: bool = False, - errors: IgnoreRaise = "raise", - ) -> BasePandasDataset | None: - """ - Drop specified labels from `BasePandasDataset`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - inplace = validate_bool_kwarg(inplace, "inplace") - if labels is not None: - if index is not None or columns is not None: - raise ValueError("Cannot specify both 'labels' and 'index'/'columns'") - axes = {self._get_axis_number(axis): labels} - elif index is not None or columns is not None: - axes = {0: index, 1: columns} - else: - raise ValueError( - "Need to specify at least one of 'labels', 'index' or 'columns'" - ) - - for axis, labels in axes.items(): - if labels is not None: - if level is not None and not self._query_compiler.has_multiindex( - axis=axis - ): - # Same error as native pandas. 
- raise AssertionError("axis must be a MultiIndex") - # According to pandas documentation, a tuple will be used as a single - # label and not treated as a list-like. - if not is_list_like(labels) or isinstance(labels, tuple): - axes[axis] = [labels] - - new_query_compiler = self._query_compiler.drop( - index=axes.get(0), columns=axes.get(1), level=level, errors=errors - ) - return self._create_or_update_from_compiler(new_query_compiler, inplace) - - def _dropna( - self, - axis: Axis = 0, - how: str | NoDefault = no_default, - thresh: int | NoDefault = no_default, - subset: IndexLabel = None, - inplace: bool = False, - ): - inplace = validate_bool_kwarg(inplace, "inplace") - - if is_list_like(axis): - raise TypeError("supplying multiple axes to axis is no longer supported.") - - axis = self._get_axis_number(axis) - - if (how is not no_default) and (thresh is not no_default): - raise TypeError( - "You cannot set both the how and thresh arguments at the same time." - ) - - if how is no_default: - how = "any" - if how not in ["any", "all"]: - raise ValueError("invalid how option: %s" % how) - if subset is not None: - if axis == 1: - indices = self.index.get_indexer_for(subset) - check = indices == -1 - if check.any(): - raise KeyError(list(np.compress(check, subset))) - else: - indices = self.columns.get_indexer_for(subset) - check = indices == -1 - if check.any(): - raise KeyError(list(np.compress(check, subset))) - - new_query_compiler = self._query_compiler.dropna( - axis=axis, - how=how, - thresh=thresh, - subset=subset, - ) - return self._create_or_update_from_compiler(new_query_compiler, inplace) - - def droplevel(self, level, axis=0): # noqa: PR01, RT01, D200 - """ - Return `BasePandasDataset` with requested index / column level(s) removed. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - new_axis = self.axes[axis].droplevel(level) - result = self.copy() - if axis == 0: - result.index = new_axis - else: - result.columns = new_axis - return result - - def drop_duplicates( - self, keep="first", inplace=False, **kwargs - ): # noqa: PR01, RT01, D200 - """ - Return `BasePandasDataset` with duplicate rows removed. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - inplace = validate_bool_kwarg(inplace, "inplace") - ignore_index = kwargs.get("ignore_index", False) - subset = kwargs.get("subset", None) - if subset is not None: - if is_list_like(subset): - if not isinstance(subset, list): - subset = list(subset) - else: - subset = [subset] - df = self[subset] - else: - df = self - duplicated = df.duplicated(keep=keep) - result = self[~duplicated] - if ignore_index: - result.index = pandas.RangeIndex(stop=len(result)) - if inplace: - self._update_inplace(result._query_compiler) - else: - return result - - def mask( - self, - cond: BasePandasDataset | Callable | AnyArrayLike, - other: BasePandasDataset | Callable | Scalar | None = np.nan, - inplace: bool = False, - axis: Axis | None = None, - level: Level | None = None, - ): - """ - Replace values where the condition is True. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - # TODO: https://snowflakecomputing.atlassian.net/browse/SNOW-985670 - # will move pre-processing to QC layer. 
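# Illustrative sketch (editorial, toy data) of the mask/where pair implemented
# below, using native-pandas-style semantics: mask replaces values where the
# condition is True, where replaces values where it is False.
import pandas as pd
s = pd.Series([1, -2, 3])
print(s.mask(s > 0, 0).tolist())   # [0, -2, 0]
print(s.where(s > 0, 0).tolist())  # [1, 0, 3]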
- inplace = validate_bool_kwarg(inplace, "inplace") - if cond is None: - raise ValueError("Array conditional must be same shape as self") - - cond = apply_if_callable(cond, self) - - if isinstance(cond, Callable): - raise NotImplementedError("Do not support callable for 'cond' parameter.") - - from snowflake.snowpark.modin.pandas import Series - - if isinstance(cond, Series): - cond._query_compiler._shape_hint = "column" - if isinstance(self, Series): - self._query_compiler._shape_hint = "column" - if isinstance(other, Series): - other._query_compiler._shape_hint = "column" - - if not isinstance(cond, BasePandasDataset): - cond = get_as_shape_compatible_dataframe_or_series(cond, self) - cond._query_compiler._shape_hint = "array" - - if other is not None: - other = apply_if_callable(other, self) - - if isinstance(other, np.ndarray): - other = get_as_shape_compatible_dataframe_or_series( - other, - self, - shape_mismatch_message="other must be the same shape as self when an ndarray", - ) - other._query_compiler._shape_hint = "array" - - if isinstance(other, BasePandasDataset): - other = other._query_compiler - - query_compiler = self._query_compiler.mask( - cond._query_compiler, - other, - axis, - level, - ) - - return self._create_or_update_from_compiler(query_compiler, inplace) - - def where( - self, - cond: BasePandasDataset | Callable | AnyArrayLike, - other: BasePandasDataset | Callable | Scalar | None = np.nan, - inplace: bool = False, - axis: Axis | None = None, - level: Level | None = None, - ): - """ - Replace values where the condition is False. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - # TODO: SNOW-985670: Refactor `where` and `mask` - # will move pre-processing to QC layer. - inplace = validate_bool_kwarg(inplace, "inplace") - if cond is None: - raise ValueError("Array conditional must be same shape as self") - - cond = apply_if_callable(cond, self) - - if isinstance(cond, Callable): - raise NotImplementedError("Do not support callable for 'cond' parameter.") - - from snowflake.snowpark.modin.pandas import Series - - if isinstance(cond, Series): - cond._query_compiler._shape_hint = "column" - if isinstance(self, Series): - self._query_compiler._shape_hint = "column" - if isinstance(other, Series): - other._query_compiler._shape_hint = "column" - - if not isinstance(cond, BasePandasDataset): - cond = get_as_shape_compatible_dataframe_or_series(cond, self) - cond._query_compiler._shape_hint = "array" - - if other is not None: - other = apply_if_callable(other, self) - - if isinstance(other, np.ndarray): - other = get_as_shape_compatible_dataframe_or_series( - other, - self, - shape_mismatch_message="other must be the same shape as self when an ndarray", - ) - other._query_compiler._shape_hint = "array" - - if isinstance(other, BasePandasDataset): - other = other._query_compiler - - query_compiler = self._query_compiler.where( - cond._query_compiler, - other, - axis, - level, - ) - - return self._create_or_update_from_compiler(query_compiler, inplace) - - def eq(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 - """ - Get equality of `BasePandasDataset` and `other`, element-wise (binary operator `eq`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op("eq", other, axis=axis, level=level, dtypes=np.bool_) - - def explode(self, column, ignore_index: bool = False): # noqa: PR01, RT01, D200 - """ - Transform each element of a list-like to a row. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - exploded = self.__constructor__( - query_compiler=self._query_compiler.explode(column) - ) - if ignore_index: - exploded = exploded.reset_index(drop=True) - return exploded - - def ewm( - self, - com: float | None = None, - span: float | None = None, - halflife: float | TimedeltaConvertibleTypes | None = None, - alpha: float | None = None, - min_periods: int | None = 0, - adjust: bool = True, - ignore_na: bool = False, - axis: Axis = 0, - times: str | np.ndarray | BasePandasDataset | None = None, - method: str = "single", - ) -> pandas.core.window.ewm.ExponentialMovingWindow: # noqa: PR01, RT01, D200 - """ - Provide exponentially weighted (EW) calculations. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas( - "ewm", - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na, - axis=axis, - times=times, - method=method, - ) - - def expanding( - self, min_periods=1, axis=0, method="single" - ): # noqa: PR01, RT01, D200 - """ - Provide expanding window calculations. - """ - from .window import Expanding - - if axis is not lib.no_default: - axis = self._get_axis_number(axis) - name = "expanding" - if axis == 1: - warnings.warn( - f"Support for axis=1 in {type(self).__name__}.{name} is " - + "deprecated and will be removed in a future version. " - + f"Use obj.T.{name}(...) instead", - FutureWarning, - stacklevel=1, - ) - else: - warnings.warn( - f"The 'axis' keyword in {type(self).__name__}.{name} is " - + "deprecated and will be removed in a future version. " - + "Call the method without the axis keyword instead.", - FutureWarning, - stacklevel=1, - ) - else: - axis = 0 - - return Expanding( - self, - min_periods=min_periods, - axis=axis, - method=method, - ) - - def ffill( - self, - axis: Axis | None = None, - inplace: bool = False, - limit: int | None = None, - downcast: dict | None = None, - ): - """ - Synonym for `DataFrame.fillna` with ``method='ffill'``. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.fillna( - method="ffill", axis=axis, limit=limit, downcast=downcast, inplace=inplace - ) - - def fillna( - self, - self_is_series, - value: Hashable | Mapping | pd.Series | pd.DataFrame = None, - method: FillnaOptions | None = None, - axis: Axis | None = None, - inplace: bool = False, - limit: int | None = None, - downcast: dict | None = None, - ): - """ - Fill NA/NaN values using the specified method. - - Parameters - ---------- - self_is_series : bool - If True then self contains a Series object, if False then self contains - a DataFrame object. - value : scalar, dict, Series, or DataFrame, default: None - Value to use to fill holes (e.g. 0), alternately a - dict/Series/DataFrame of values specifying which value to use for - each index (for a Series) or column (for a DataFrame). Values not - in the dict/Series/DataFrame will not be filled. This value cannot - be a list. - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default: None - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use next valid observation to fill gap. - axis : {None, 0, 1}, default: None - Axis along which to fill missing values. - inplace : bool, default: False - If True, fill in-place. 
Note: this will modify any - other views on this object (e.g., a no-copy slice for a column in a - DataFrame). - limit : int, default: None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. Must be greater than 0 if not None. - downcast : dict, default: None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). - - Returns - ------- - Series, DataFrame or None - Object with missing values filled or None if ``inplace=True``. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - raise_if_native_pandas_objects(value) - inplace = validate_bool_kwarg(inplace, "inplace") - axis = self._get_axis_number(axis) - if isinstance(value, (list, tuple)): - raise TypeError( - '"value" parameter must be a scalar or dict, but ' - + f'you passed a "{type(value).__name__}"' - ) - if value is None and method is None: - # same as pandas - raise ValueError("Must specify a fill 'value' or 'method'.") - if value is not None and method is not None: - raise ValueError("Cannot specify both 'value' and 'method'.") - if method is not None and method not in ["backfill", "bfill", "pad", "ffill"]: - expecting = "pad (ffill) or backfill (bfill)" - msg = "Invalid fill method. Expecting {expecting}. Got {method}".format( - expecting=expecting, method=method - ) - raise ValueError(msg) - if limit is not None: - if not isinstance(limit, int): - raise ValueError("Limit must be an integer") - elif limit <= 0: - raise ValueError("Limit must be greater than 0") - - new_query_compiler = self._query_compiler.fillna( - self_is_series=self_is_series, - value=value, - method=method, - axis=axis, - limit=limit, - downcast=downcast, - ) - return self._create_or_update_from_compiler(new_query_compiler, inplace) - - def filter( - self, items=None, like=None, regex=None, axis=None - ): # noqa: PR01, RT01, D200 - """ - Subset the `BasePandasDataset` rows or columns according to the specified index labels. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - nkw = count_not_none(items, like, regex) - if nkw > 1: - raise TypeError( - "Keyword arguments `items`, `like`, or `regex` are mutually exclusive" - ) - if nkw == 0: - raise TypeError("Must pass either `items`, `like`, or `regex`") - if axis is None: - axis = "columns" # This is the default info axis for dataframes - - axis = self._get_axis_number(axis) - labels = self.columns if axis else self.index - - if items is not None: - bool_arr = labels.isin(items) - elif like is not None: - - def f(x): - return like in str(x) - - bool_arr = labels.map(f).tolist() - else: - - def f(x): - return matcher.search(str(x)) is not None - - matcher = re.compile(regex) - bool_arr = labels.map(f).tolist() - if not axis: - return self[bool_arr] - return self[self.columns[bool_arr]] - - def first(self, offset): # noqa: PR01, RT01, D200 - """ - Select initial periods of time series data based on a date offset. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.loc[pandas.Series(index=self.index).first(offset).index] - - def first_valid_index(self) -> Scalar | tuple[Scalar]: - """ - Return index for first non-NA value or None, if no non-NA value is found. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._query_compiler.first_valid_index() - - def floordiv( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get integer division of `BasePandasDataset` and `other`, element-wise (binary operator `floordiv`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "floordiv", other, axis=axis, level=level, fill_value=fill_value - ) - - def ge(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 - """ - Get greater than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ge`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op("ge", other, axis=axis, level=level, dtypes=np.bool_) - - def get(self, key, default=None): # noqa: PR01, RT01, D200 - """ - Get item from object for given key. - """ - try: - return self.__getitem__(key) - except (KeyError, ValueError, IndexError): - return default - - def gt(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 - """ - Get greater than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `gt`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op("gt", other, axis=axis, level=level, dtypes=np.bool_) - - def head(self, n: int = 5): - """ - Return the first `n` rows. - """ - return self.iloc[:n] - - @property - def iat(self, axis=None): # noqa: PR01, RT01, D200 - """ - Get a single value for a row/column pair by integer position. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - from .indexing import _iAtIndexer - - return _iAtIndexer(self) - - def idxmax(self, axis=0, skipna=True, numeric_only=False): # noqa: PR01, RT01, D200 - """ - Return index of first occurrence of maximum over requested axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - dtypes = self._get_dtypes() - if ( - axis == 1 - and not numeric_only - and any(not is_numeric_dtype(d) for d in dtypes) - and len(set(dtypes)) > 1 - ): - # For numeric_only=False, if we have any non-numeric dtype, e.g. - # a string type, we need every other column to be of the same type. - # We can't compare two objects of different non-numeric types, e.g. - # a string and a timestamp. - # If we have only numeric data, we can compare columns even if they - # different types, e.g. we can compare an int column to a float - # column. - raise TypeError("'>' not supported for these dtypes") - axis = self._get_axis_number(axis) - return self._reduce_dimension( - self._query_compiler.idxmax( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - ) - - def idxmin(self, axis=0, skipna=True, numeric_only=False): # noqa: PR01, RT01, D200 - """ - Return index of first occurrence of minimum over requested axis. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - dtypes = self._get_dtypes() - if ( - axis == 1 - and not numeric_only - and any(not is_numeric_dtype(d) for d in dtypes) - and len(set(dtypes)) > 1 - ): - # For numeric_only=False, if we have any non-numeric dtype, e.g. - # a string type, we need every other column to be of the same type. - # We can't compare two objects of different non-numeric types, e.g. - # a string and a timestamp. - # If we have only numeric data, we can compare columns even if they - # different types, e.g. we can compare an int column to a float - # column. - raise TypeError("'<' not supported for these dtypes") - axis = self._get_axis_number(axis) - return self._reduce_dimension( - self._query_compiler.idxmin( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - ) - - def infer_objects( - self, copy: bool | None = None - ) -> BasePandasDataset: # pragma: no cover # noqa: RT01, D200 - """ - Attempt to infer better dtypes for object columns. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - new_query_compiler = self._query_compiler.infer_objects() - return self._create_or_update_from_compiler( - new_query_compiler, inplace=False if copy is None else not copy - ) - - def convert_dtypes( - self, - infer_objects: bool = True, - convert_string: bool = True, - convert_integer: bool = True, - convert_boolean: bool = True, - convert_floating: bool = True, - dtype_backend: DtypeBackend = "numpy_nullable", - ): # noqa: PR01, RT01, D200 - """ - Convert columns to best possible dtypes using dtypes supporting ``pd.NA``. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.__constructor__( - query_compiler=self._query_compiler.convert_dtypes( - infer_objects=infer_objects, - convert_string=convert_string, - convert_integer=convert_integer, - convert_boolean=convert_boolean, - convert_floating=convert_floating, - dtype_backend=dtype_backend, - ) - ) - - def isin( - self, values: BasePandasDataset | ListLike | dict[Hashable, ListLike] - ) -> BasePandasDataset: # noqa: PR01, RT01, D200 - """ - Whether elements in `BasePandasDataset` are contained in `values`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - - # Pass as query compiler if values is BasePandasDataset. - if isinstance(values, BasePandasDataset): - values = values._query_compiler - - # Convert non-dict values to List if values is neither List[Any] nor np.ndarray. SnowflakeQueryCompiler - # expects for the non-lazy case, where values is not a BasePandasDataset, the data to be materialized - # as list or numpy array. Because numpy may perform implicit type conversions, use here list to be more general. - elif not isinstance(values, dict) and ( - not isinstance(values, list) or not isinstance(values, np.ndarray) - ): - values = list(values) - - return self.__constructor__( - query_compiler=self._query_compiler.isin(values=values) - ) - - def isna(self): # noqa: RT01, D200 - """ - Detect missing values. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.__constructor__(query_compiler=self._query_compiler.isna()) - - isnull = isna - - @property - def iloc(self): - """ - Purely integer-location based indexing for selection by position. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - # TODO: SNOW-930028 enable all skipped doctests - from .indexing import _iLocIndexer - - return _iLocIndexer(self) - - def kurt(self, axis=no_default, skipna=True, numeric_only=False, **kwargs): - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - validate_bool_kwarg(skipna, "skipna", none_allowed=False) - axis = self._get_axis_number(axis) - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - - data = ( - self._get_numeric_data(axis) - if numeric_only is None or numeric_only - else self - ) - - return self._reduce_dimension( - data._query_compiler.kurt( - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - ) - - kurtosis = kurt - - def last(self, offset): # noqa: PR01, RT01, D200 - """ - Select final periods of time series data based on a date offset. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.loc[pandas.Series(index=self.index).last(offset).index] - - def last_valid_index(self) -> Scalar | tuple[Scalar]: - """ - Return index for last non-NA value or None, if no non-NA value is found. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._query_compiler.last_valid_index() - - def le(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 - """ - Get less than or equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `le`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op("le", other, axis=axis, level=level, dtypes=np.bool_) - - def lt(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 - """ - Get less than comparison of `BasePandasDataset` and `other`, element-wise (binary operator `lt`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op("lt", other, axis=axis, level=level, dtypes=np.bool_) - - @property - def loc(self): - """ - Get a group of rows and columns by label(s) or a boolean array. - """ - # TODO: SNOW-935444 fix doctest where index key has name - # TODO: SNOW-933782 fix multiindex transpose bug, e.g., Name: (cobra, mark ii) => Name: ('cobra', 'mark ii') - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - from .indexing import _LocIndexer - - return _LocIndexer(self) - - def _agg_helper( - self, - func: str, - skipna: bool = True, - axis: int | None | NoDefault = no_default, - numeric_only: bool = False, - **kwargs: Any, - ): - if not self._is_dataframe and numeric_only and not is_numeric_dtype(self.dtype): - # Series aggregations on non-numeric data do not support numeric_only: - # https://github.com/pandas-dev/pandas/blob/cece8c6579854f6b39b143e22c11cac56502c4fd/pandas/core/series.py#L6358 - raise TypeError( - f"Series.{func} does not allow numeric_only=True with non-numeric dtypes." 
- ) - axis = self._get_axis_number(axis) - numeric_only = validate_bool_kwarg( - numeric_only, "numeric_only", none_allowed=True - ) - skipna = validate_bool_kwarg(skipna, "skipna", none_allowed=False) - agg_kwargs: dict[str, Any] = { - "numeric_only": numeric_only, - "skipna": skipna, - } - agg_kwargs.update(kwargs) - return self.aggregate(func=func, axis=axis, **agg_kwargs) - - def max( - self, - axis: Axis | None = 0, - skipna: bool = True, - numeric_only: bool = False, - **kwargs: Any, - ): - """ - Return the maximum of the values over the requested axis. - """ - return self._agg_helper( - func="max", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - def _stat_operation( - self, - op_name: str, - axis: int | str, - skipna: bool, - numeric_only: bool = False, - **kwargs, - ): - """ - Do common statistic reduce operations under frame. - - Parameters - ---------- - op_name : str - Name of method to apply. - axis : int or str - Axis to apply method on. - skipna : bool - Exclude NA/null values when computing the result. - numeric_only : bool - Include only float, int, boolean columns. - **kwargs : dict - Additional keyword arguments to pass to `op_name`. - - Returns - ------- - scalar or Series - `scalar` - self is Series - `Series` - self is DataFrame - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - validate_bool_kwarg(skipna, "skipna", none_allowed=False) - if not numeric_only: - self._validate_dtypes(numeric_only=True) - - data = self._get_numeric_data(axis) if numeric_only else self - result_qc = getattr(data._query_compiler, op_name)( - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - result_qc = self._reduce_dimension(result_qc) - return result_qc - - def memory_usage(self, index=True, deep=False): # noqa: PR01, RT01, D200 - """ - Return the memory usage of the `BasePandasDataset`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._reduce_dimension( - self._query_compiler.memory_usage(index=index, deep=deep) - ) - - def min( - self, - axis: Axis | None | NoDefault = no_default, - skipna: bool = True, - numeric_only: bool = False, - **kwargs, - ): - """ - Return the minimum of the values over the requested axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._agg_helper( - func="min", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - def mod( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get modulo of `BasePandasDataset` and `other`, element-wise (binary operator `mod`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "mod", other, axis=axis, level=level, fill_value=fill_value - ) - - def mode(self, axis=0, numeric_only=False, dropna=True): # noqa: PR01, RT01, D200 - """ - Get the mode(s) of each element along the selected axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - return self.__constructor__( - query_compiler=self._query_compiler.mode( - axis=axis, numeric_only=numeric_only, dropna=dropna - ) - ) - - def mul( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get multiplication of `BasePandasDataset` and `other`, element-wise (binary operator `mul`). 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "mul", other, axis=axis, level=level, fill_value=fill_value - ) - - multiply = mul - - def ne(self, other, axis="columns", level=None): # noqa: PR01, RT01, D200 - """ - Get Not equal comparison of `BasePandasDataset` and `other`, element-wise (binary operator `ne`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op("ne", other, axis=axis, level=level, dtypes=np.bool_) - - def notna(self): # noqa: RT01, D200 - """ - Detect existing (non-missing) values. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.__constructor__(query_compiler=self._query_compiler.notna()) - - notnull = notna - - def nunique(self, axis=0, dropna=True): # noqa: PR01, RT01, D200 - """ - Return number of unique elements in the `BasePandasDataset`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - result = self._reduce_dimension( - self._query_compiler.nunique(axis=axis, dropna=dropna) - ) - return result - - def pad( - self, - axis: Axis | None = None, - inplace: bool = False, - limit: int | None = None, - downcast: dict | None = None, - ): - """ - Synonym for `DataFrame.fillna` with ``method='ffill'``. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - warnings.warn( - "Series/DataFrame.pad is deprecated. Use Series/DataFrame.ffill instead.", - FutureWarning, - stacklevel=1, - ) - return self.fillna( - method="ffill", axis=axis, limit=limit, downcast=downcast, inplace=inplace - ) - - def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None, **kwargs - ): # noqa: PR01, RT01, D200 - """ - Percentage change between the current and a prior element. - """ - if fill_method not in (lib.no_default, None) or limit is not lib.no_default: - warnings.warn( - "The 'fill_method' keyword being not None and the 'limit' keyword in " - + f"{type(self).__name__}.pct_change are deprecated and will be removed " - + "in a future version. Either fill in any non-leading NA values prior " - + "to calling pct_change or specify 'fill_method=None' to not fill NA " - + "values.", - FutureWarning, - stacklevel=1, - ) - if fill_method is lib.no_default: - warnings.warn( - f"The default fill_method='pad' in {type(self).__name__}.pct_change is " - + "deprecated and will be removed in a future version. Either fill in any " - + "non-leading NA values prior to calling pct_change or specify 'fill_method=None' " - + "to not fill NA values.", - FutureWarning, - stacklevel=1, - ) - fill_method = "pad" - - if limit is lib.no_default: - limit = None - - if "axis" in kwargs: - kwargs["axis"] = self._get_axis_number(kwargs["axis"]) - - # Attempting to match pandas error behavior here - if not isinstance(periods, int): - raise TypeError(f"periods must be an int. got {type(periods)} instead") - - # Attempting to match pandas error behavior here - for dtype in self._get_dtypes(): - if not is_numeric_dtype(dtype): - raise TypeError( - f"cannot perform pct_change on non-numeric column with dtype {dtype}" - ) - - return self.__constructor__( - query_compiler=self._query_compiler.pct_change( - periods=periods, - fill_method=fill_method, - limit=limit, - freq=freq, - **kwargs, - ) - ) - - def pipe(self, func, *args, **kwargs): # noqa: PR01, RT01, D200 - """ - Apply chainable functions that expect `BasePandasDataset`. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return pipe(self, func, *args, **kwargs) - - def pop(self, item): # noqa: PR01, RT01, D200 - """ - Return item and drop from frame. Raise KeyError if not found. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - result = self[item] - del self[item] - return result - - def pow( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get exponential power of `BasePandasDataset` and `other`, element-wise (binary operator `pow`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "pow", other, axis=axis, level=level, fill_value=fill_value - ) - - def quantile( - self, - q: Scalar | ListLike = 0.5, - axis: Axis = 0, - numeric_only: bool = False, - interpolation: Literal[ - "linear", "lower", "higher", "midpoint", "nearest" - ] = "linear", - method: Literal["single", "table"] = "single", - ) -> float | BasePandasDataset: - """ - Return values at the given quantile over requested axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - - # TODO - # - SNOW-1008361: support axis=1 - # - SNOW-1008367: support when q is Snowpandas DF/Series (need to require QC interface to accept QC q values) - # - SNOW-1003587: support datetime/timedelta columns - - if ( - axis == 1 - or interpolation not in ["linear", "nearest"] - or method != "single" - ): - ErrorMessage.not_implemented( - f"quantile function with parameters axis={axis}, interpolation={interpolation}, method={method} not supported" - ) - - if not numeric_only: - # If not numeric_only and columns, then check all columns are either - # numeric, timestamp, or timedelta - # Check if dtype is numeric, timedelta ("m"), or datetime ("M") - if not axis and not all( - is_numeric_dtype(t) or lib.is_np_dtype(t, "mM") - for t in self._get_dtypes() - ): - raise TypeError("can't multiply sequence by non-int of type 'float'") - # If over rows, then make sure that all dtypes are equal for not - # numeric_only - elif axis: - for i in range(1, len(self._get_dtypes())): - pre_dtype = self._get_dtypes()[i - 1] - curr_dtype = self._get_dtypes()[i] - if not is_dtype_equal(pre_dtype, curr_dtype): - raise TypeError( - "Cannot compare type '{}' with type '{}'".format( - pre_dtype, curr_dtype - ) - ) - else: - # Normally pandas returns this near the end of the quantile, but we - # can't afford the overhead of running the entire operation before - # we error. 
- if not any(is_numeric_dtype(t) for t in self._get_dtypes()): - raise ValueError("need at least one array to concatenate") - - # check that all qs are between 0 and 1 - validate_percentile(q) - axis = self._get_axis_number(axis) - query_compiler = self._query_compiler.quantiles_along_axis0( - q=q if is_list_like(q) else [q], - numeric_only=numeric_only, - interpolation=interpolation, - method=method, - ) - if is_list_like(q): - return self.__constructor__(query_compiler=query_compiler) - else: - # result is either a scalar or Series - result = self._reduce_dimension(query_compiler.transpose_single_row()) - if isinstance(result, BasePandasDataset): - result.name = q - return result - - @_inherit_docstrings(pandas.DataFrame.rank, apilink="pandas.DataFrame.rank") - def rank( - self, - axis=0, - method: str = "average", - numeric_only: bool = False, - na_option: str = "keep", - ascending: bool = True, - pct: bool = False, - ): - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - return self.__constructor__( - query_compiler=self._query_compiler.rank( - axis=axis, - method=method, - numeric_only=numeric_only, - na_option=na_option, - ascending=ascending, - pct=pct, - ) - ) - - def _copy_index_metadata(self, source, destination): # noqa: PR01, RT01, D200 - """ - Copy Index metadata from `source` to `destination` inplace. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if hasattr(source, "name") and hasattr(destination, "name"): - destination.name = source.name - if hasattr(source, "names") and hasattr(destination, "names"): - destination.names = source.names - return destination - - def _ensure_index(self, index_like, axis=0): # noqa: PR01, RT01, D200 - """ - Ensure that we have an index from some index-like object. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if ( - self._query_compiler.has_multiindex(axis=axis) - and not isinstance(index_like, pandas.Index) - and is_list_like(index_like) - and len(index_like) > 0 - and isinstance(index_like[0], tuple) - ): - try: - return pandas.MultiIndex.from_tuples(index_like) - except TypeError: - # not all tuples - pass - return ensure_index(index_like) - - def reindex( - self, - index=None, - columns=None, - copy=True, - **kwargs, - ): # noqa: PR01, RT01, D200 - """ - Conform `BasePandasDataset` to new index with optional filling logic. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if kwargs.get("limit", None) is not None and kwargs.get("method", None) is None: - raise ValueError( - "limit argument only valid if doing pad, backfill or nearest reindexing" - ) - new_query_compiler = None - if index is not None: - if not isinstance(index, pandas.Index) or not index.equals(self.index): - new_query_compiler = self._query_compiler.reindex( - axis=0, labels=index, **kwargs - ) - if new_query_compiler is None: - new_query_compiler = self._query_compiler - final_query_compiler = None - if columns is not None: - if not isinstance(index, pandas.Index) or not columns.equals(self.columns): - final_query_compiler = new_query_compiler.reindex( - axis=1, labels=columns, **kwargs - ) - if final_query_compiler is None: - final_query_compiler = new_query_compiler - return self._create_or_update_from_compiler( - final_query_compiler, inplace=False if copy is None else not copy - ) - - def rename_axis( - self, - mapper=lib.no_default, - *, - index=lib.no_default, - columns=lib.no_default, - axis=0, - copy=None, - inplace=False, - ): # noqa: PR01, RT01, D200 - """ - Set the name of the axis for the index or columns. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axes = {"index": index, "columns": columns} - - if copy is None: - copy = True - - if axis is not None: - axis = self._get_axis_number(axis) - else: - axis = 0 - - inplace = validate_bool_kwarg(inplace, "inplace") - - if mapper is not lib.no_default and mapper is not None: - # Use v0.23 behavior if a scalar or list - non_mapper = is_scalar(mapper) or ( - is_list_like(mapper) and not is_dict_like(mapper) - ) - if non_mapper: - return self._set_axis_name(mapper, axis=axis, inplace=inplace) - else: - raise ValueError("Use `.rename` to alter labels with a mapper.") - else: - # Use new behavior. Means that index and/or columns is specified - result = self if inplace else self.copy(deep=copy) - - for axis in range(self.ndim): - v = axes.get(pandas.DataFrame._get_axis_name(axis)) - if v is lib.no_default: - continue - non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) - if non_mapper: - newnames = v - else: - - def _get_rename_function(mapper): - if isinstance(mapper, (dict, BasePandasDataset)): - - def f(x): - if x in mapper: - return mapper[x] - else: - return x - - else: - f = mapper - - return f - - f = _get_rename_function(v) - curnames = self.index.names if axis == 0 else self.columns.names - newnames = [f(name) for name in curnames] - result._set_axis_name(newnames, axis=axis, inplace=True) - if not inplace: - return result - - def reorder_levels(self, order, axis=0): # noqa: PR01, RT01, D200 - """ - Rearrange index levels using input order. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - new_labels = self.axes[axis].reorder_levels(order) - return self.set_axis(new_labels, axis=axis) - - def resample( - self, - rule, - axis: Axis = lib.no_default, - closed: str | None = None, - label: str | None = None, - convention: str = "start", - kind: str | None = None, - on: Level = None, - level: Level = None, - origin: str | TimestampConvertibleTypes = "start_day", - offset: TimedeltaConvertibleTypes | None = None, - group_keys=no_default, - ): # noqa: PR01, RT01, D200 - """ - Resample time-series data. 
- """ - from .resample import Resampler - - if axis is not lib.no_default: # pragma: no cover - axis = self._get_axis_number(axis) - if axis == 1: - warnings.warn( - "DataFrame.resample with axis=1 is deprecated. Do " - + "`frame.T.resample(...)` without axis instead.", - FutureWarning, - stacklevel=1, - ) - else: - warnings.warn( - f"The 'axis' keyword in {type(self).__name__}.resample is " - + "deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=1, - ) - else: - axis = 0 - - return Resampler( - dataframe=self, - rule=rule, - axis=axis, - closed=closed, - label=label, - convention=convention, - kind=kind, - on=on, - level=level, - origin=origin, - offset=offset, - group_keys=group_keys, - ) - - def reset_index( - self, - level: IndexLabel = None, - drop: bool = False, - inplace: bool = False, - col_level: Hashable = 0, - col_fill: Hashable = "", - allow_duplicates=no_default, - names: Hashable | Sequence[Hashable] = None, - ): - """ - Reset the index, or a level of it. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - inplace = validate_bool_kwarg(inplace, "inplace") - if allow_duplicates is no_default: - allow_duplicates = False - new_query_compiler = self._query_compiler.reset_index( - drop=drop, - level=level, - col_level=col_level, - col_fill=col_fill, - allow_duplicates=allow_duplicates, - names=names, - ) - return self._create_or_update_from_compiler(new_query_compiler, inplace) - - def radd( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Return addition of `BasePandasDataset` and `other`, element-wise (binary operator `radd`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "radd", other, axis=axis, level=level, fill_value=fill_value - ) - - def rfloordiv( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get integer division of `BasePandasDataset` and `other`, element-wise (binary operator `rfloordiv`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "rfloordiv", other, axis=axis, level=level, fill_value=fill_value - ) - - def rmod( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get modulo of `BasePandasDataset` and `other`, element-wise (binary operator `rmod`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "rmod", other, axis=axis, level=level, fill_value=fill_value - ) - - def rmul( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get Multiplication of dataframe and other, element-wise (binary operator `rmul`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "rmul", other, axis=axis, level=level, fill_value=fill_value - ) - - def rolling( - self, - window, - min_periods: int | None = None, - center: bool = False, - win_type: str | None = None, - on: str | None = None, - axis: Axis = lib.no_default, - closed: str | None = None, - step: int | None = None, - method: str = "single", - ): # noqa: PR01, RT01, D200 - """ - Provide rolling window calculations. 
- """ - if axis is not lib.no_default: - axis = self._get_axis_number(axis) - name = "rolling" - if axis == 1: - warnings.warn( - f"Support for axis=1 in {type(self).__name__}.{name} is " - + "deprecated and will be removed in a future version. " - + f"Use obj.T.{name}(...) instead", - FutureWarning, - stacklevel=1, - ) - else: # pragma: no cover - warnings.warn( - f"The 'axis' keyword in {type(self).__name__}.{name} is " - + "deprecated and will be removed in a future version. " - + "Call the method without the axis keyword instead.", - FutureWarning, - stacklevel=1, - ) - else: - axis = 0 - - if win_type is not None: - from .window import Window - - return Window( - self, - window=window, - min_periods=min_periods, - center=center, - win_type=win_type, - on=on, - axis=axis, - closed=closed, - step=step, - method=method, - ) - from .window import Rolling - - return Rolling( - self, - window=window, - min_periods=min_periods, - center=center, - win_type=win_type, - on=on, - axis=axis, - closed=closed, - step=step, - method=method, - ) - - def round(self, decimals=0, *args, **kwargs): # noqa: PR01, RT01, D200 - """ - Round a `BasePandasDataset` to a variable number of decimal places. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - # FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility - # purpose and does not affect the result, we shouldn't pass them to the query compiler. - return self.__constructor__( - query_compiler=self._query_compiler.round(decimals=decimals, **kwargs) - ) - - def rpow( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get exponential power of `BasePandasDataset` and `other`, element-wise (binary operator `rpow`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "rpow", other, axis=axis, level=level, fill_value=fill_value - ) - - def rsub( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get subtraction of `BasePandasDataset` and `other`, element-wise (binary operator `rsub`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "rsub", other, axis=axis, level=level, fill_value=fill_value - ) - - def rtruediv( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get floating division of `BasePandasDataset` and `other`, element-wise (binary operator `rtruediv`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "rtruediv", other, axis=axis, level=level, fill_value=fill_value - ) - - rdiv = rtruediv - - def sample( - self, - n: int | None = None, - frac: float | None = None, - replace: bool = False, - weights: str | np.ndarray | None = None, - random_state: RandomState | None = None, - axis: Axis | None = None, - ignore_index: bool = False, - ): - """ - Return a random sample of items from an axis of object. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if self._get_axis_number(axis): - if weights is not None and isinstance(weights, str): - raise ValueError( - "Strings can only be passed to weights when sampling from rows on a DataFrame" - ) - else: - if n is None and frac is None: - n = 1 - elif n is not None and frac is not None: - raise ValueError("Please enter a value for `frac` OR `n`, not both") - else: - if n is not None: - if n < 0: - raise ValueError( - "A negative number of rows requested. Please provide `n` >= 0." - ) - if n % 1 != 0: - raise ValueError("Only integers accepted as `n` values") - else: - if frac < 0: - raise ValueError( - "A negative number of rows requested. Please provide `frac` >= 0." - ) - - query_compiler = self._query_compiler.sample( - n, frac, replace, weights, random_state, axis, ignore_index - ) - return self.__constructor__(query_compiler=query_compiler) - - def sem( - self, - axis: Axis | None = None, - skipna: bool = True, - ddof: int = 1, - numeric_only=False, - **kwargs, - ): # noqa: PR01, RT01, D200 - """ - Return unbiased standard error of the mean over requested axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._stat_operation( - "sem", axis, skipna, numeric_only, ddof=ddof, **kwargs - ) - - def mean( - self, - axis: Axis | None | NoDefault = no_default, - skipna: bool = True, - numeric_only: bool = False, - **kwargs: Any, - ): - """ - Return the mean of the values over the requested axis. - """ - return self._agg_helper( - func="mean", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - def median( - self, - axis: Axis | None | NoDefault = no_default, - skipna: bool = True, - numeric_only: bool = False, - **kwargs: Any, - ): - """ - Return the mean of the values over the requested axis. - """ - return self._agg_helper( - func="median", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - def set_flags( - self, *, copy: bool = False, allows_duplicate_labels: bool | None = None - ): # noqa: PR01, RT01, D200 - """ - Return a new `BasePandasDataset` with updated flags. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas( - pandas.DataFrame.set_flags, - copy=copy, - allows_duplicate_labels=allows_duplicate_labels, - ) - - @property - def flags(self): - return self._default_to_pandas(lambda df: df.flags) - - def shift( - self, - periods: int | Sequence[int] = 1, - freq=None, - axis: Axis = 0, - fill_value: Hashable = no_default, - suffix: str | None = None, - ) -> BasePandasDataset: - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if periods == 0 and freq is None: - # Check obvious case first, freq manipulates the index even for periods == 0 so check for it in addition. - return self.copy() - - # pandas compatible ValueError for freq='infer' - # TODO: Test as part of SNOW-1023324. 
- if freq == "infer": # pragma: no cover - if not hasattr(self, "freq") and not hasattr( # pragma: no cover - self, "inferred_freq" # pragma: no cover - ): # pragma: no cover - raise ValueError() # pragma: no cover - - axis = self._get_axis_number(axis) - - if fill_value == no_default: - fill_value = None - - new_query_compiler = self._query_compiler.shift( - periods, freq, axis, fill_value, suffix - ) - return self._create_or_update_from_compiler(new_query_compiler, False) - - def skew( - self, - axis: Axis | None | NoDefault = no_default, - skipna: bool = True, - numeric_only=True, - **kwargs, - ): # noqa: PR01, RT01, D200 - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - """ - Return unbiased skew over requested axis. - """ - return self._stat_operation("skew", axis, skipna, numeric_only, **kwargs) - - def sort_index( - self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - sort_remaining=True, - ignore_index: bool = False, - key: IndexKeyFunc | None = None, - ): # noqa: PR01, RT01, D200 - """ - Sort object by labels (along an axis). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - # pandas throws this exception. See pandas issue #39434 - if ascending is None: - raise ValueError( - "the `axis` parameter is not supported in the pandas implementation of argsort()" - ) - axis = self._get_axis_number(axis) - inplace = validate_bool_kwarg(inplace, "inplace") - new_query_compiler = self._query_compiler.sort_index( - axis=axis, - level=level, - ascending=ascending, - kind=kind, - na_position=na_position, - sort_remaining=sort_remaining, - ignore_index=ignore_index, - key=key, - ) - return self._create_or_update_from_compiler(new_query_compiler, inplace) - - def sort_values( - self, - by, - axis=0, - ascending=True, - inplace: bool = False, - kind="quicksort", - na_position="last", - ignore_index: bool = False, - key: IndexKeyFunc | None = None, - ): # noqa: PR01, RT01, D200 - """ - Sort by the values along either axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - inplace = validate_bool_kwarg(inplace, "inplace") - ascending = validate_ascending(ascending) - if axis == 0: - # If any column is None raise KeyError (same a native pandas). - if by is None or (isinstance(by, list) and None in by): - # Same error message as native pandas. - raise KeyError(None) - if not isinstance(by, list): - by = [by] - - # Convert 'ascending' to sequence if needed. - if not isinstance(ascending, Sequence): - ascending = [ascending] * len(by) - if len(by) != len(ascending): - # Same error message as native pandas. - raise ValueError( - f"Length of ascending ({len(ascending)})" - f" != length of by ({len(by)})" - ) - - columns = self._query_compiler.columns.values.tolist() - index_names = self._query_compiler.get_index_names() - for by_col in by: - col_count = columns.count(by_col) - index_count = index_names.count(by_col) - if col_count == 0 and index_count == 0: - # Same error message as native pandas. - raise KeyError(by_col) - if col_count and index_count: - # Same error message as native pandas. - raise ValueError( - f"'{by_col}' is both an index level and a column label, which is ambiguous." - ) - if col_count > 1: - # Same error message as native pandas. 
- raise ValueError(f"The column label '{by_col}' is not unique.") - - if na_position not in get_args(NaPosition): - # Same error message as native pandas for invalid 'na_position' value. - raise ValueError(f"invalid na_position: {na_position}") - result = self._query_compiler.sort_rows_by_column_values( - by, - ascending=ascending, - kind=kind, - na_position=na_position, - ignore_index=ignore_index, - key=key, - ) - else: - result = self._query_compiler.sort_columns_by_row_values( - by, - ascending=ascending, - kind=kind, - na_position=na_position, - ignore_index=ignore_index, - key=key, - ) - return self._create_or_update_from_compiler(result, inplace) - - def std( - self, - axis: Axis | None = None, - skipna: bool = True, - ddof: int = 1, - numeric_only: bool = False, - **kwargs, - ): - """ - Return sample standard deviation over requested axis. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - kwargs.update({"ddof": ddof}) - return self._agg_helper( - func="std", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - def sum( - self, - axis: Axis | None = None, - skipna: bool = True, - numeric_only: bool = False, - min_count: int = 0, - **kwargs: Any, - ): - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - min_count = validate_int_kwarg(min_count, "min_count") - kwargs.update({"min_count": min_count}) - return self._agg_helper( - func="sum", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - def sub( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get subtraction of `BasePandasDataset` and `other`, element-wise (binary operator `sub`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "sub", other, axis=axis, level=level, fill_value=fill_value - ) - - subtract = sub - - def swapaxes(self, axis1, axis2, copy=True): # noqa: PR01, RT01, D200 - """ - Interchange axes and swap values axes appropriately. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis1 = self._get_axis_number(axis1) - axis2 = self._get_axis_number(axis2) - if axis1 != axis2: - return self.transpose() - if copy: - return self.copy() - return self - - def swaplevel(self, i=-2, j=-1, axis=0): # noqa: PR01, RT01, D200 - """ - Swap levels `i` and `j` in a `MultiIndex`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - idx = self.index if axis == 0 else self.columns - return self.set_axis(idx.swaplevel(i, j), axis=axis) - - def tail(self, n: int = 5): - if n == 0: - return self.iloc[0:0] - return self.iloc[-n:] - - def take( - self, - indices: list | AnyArrayLike | slice, - axis: Axis = 0, - **kwargs, - ): - """ - Return the elements in the given *positional* indices along an axis. - """ - axis = self._get_axis_number(axis) - slice_obj = indices if axis == 0 else (slice(None), indices) - return self.iloc[slice_obj] - - def to_clipboard( - self, excel=True, sep=None, **kwargs - ): # pragma: no cover # noqa: PR01, RT01, D200 - """ - Copy object to the system clipboard. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas("to_clipboard", excel=excel, sep=sep, **kwargs) - - def to_csv( - self, - path_or_buf=None, - sep=",", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - mode="w", - encoding=None, - compression="infer", - quoting=None, - quotechar='"', - lineterminator=None, - chunksize=None, - date_format=None, - doublequote=True, - escapechar=None, - decimal=".", - errors: str = "strict", - storage_options: StorageOptions = None, - ): # pragma: no cover - from snowflake.snowpark.modin.core.execution.dispatching.factories.dispatcher import ( - FactoryDispatcher, - ) - - return FactoryDispatcher.to_csv( - self._query_compiler, - path_or_buf=path_or_buf, - sep=sep, - na_rep=na_rep, - float_format=float_format, - columns=columns, - header=header, - index=index, - index_label=index_label, - mode=mode, - encoding=encoding, - compression=compression, - quoting=quoting, - quotechar=quotechar, - lineterminator=lineterminator, - chunksize=chunksize, - date_format=date_format, - doublequote=doublequote, - escapechar=escapechar, - decimal=decimal, - errors=errors, - storage_options=storage_options, - ) - - def to_excel( - self, - excel_writer, - sheet_name="Sheet1", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - startrow=0, - startcol=0, - engine=None, - merge_cells=True, - encoding=no_default, - inf_rep="inf", - verbose=no_default, - freeze_panes=None, - storage_options: StorageOptions = None, - ): # pragma: no cover # noqa: PR01, RT01, D200 - """ - Write object to an Excel sheet. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas( - "to_excel", - excel_writer, - sheet_name=sheet_name, - na_rep=na_rep, - float_format=float_format, - columns=columns, - header=header, - index=index, - index_label=index_label, - startrow=startrow, - startcol=startcol, - engine=engine, - merge_cells=merge_cells, - inf_rep=inf_rep, - freeze_panes=freeze_panes, - storage_options=storage_options, - ) - - def to_hdf( - self, path_or_buf, key, format="table", **kwargs - ): # pragma: no cover # noqa: PR01, RT01, D200 - """ - Write the contained data to an HDF5 file using HDFStore. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas( - "to_hdf", path_or_buf, key, format=format, **kwargs - ) - - def to_json( - self, - path_or_buf=None, - orient=None, - date_format=None, - double_precision=10, - force_ascii=True, - date_unit="ms", - default_handler=None, - lines=False, - compression="infer", - index=True, - indent=None, - storage_options: StorageOptions = None, - ): # pragma: no cover # noqa: PR01, RT01, D200 - """ - Convert the object to a JSON string. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas( - "to_json", - path_or_buf, - orient=orient, - date_format=date_format, - double_precision=double_precision, - force_ascii=force_ascii, - date_unit=date_unit, - default_handler=default_handler, - lines=lines, - compression=compression, - index=index, - indent=indent, - storage_options=storage_options, - ) - - def to_latex( - self, - buf=None, - columns=None, - col_space=None, - header=True, - index=True, - na_rep="NaN", - formatters=None, - float_format=None, - sparsify=None, - index_names=True, - bold_rows=False, - column_format=None, - longtable=None, - escape=None, - encoding=None, - decimal=".", - multicolumn=None, - multicolumn_format=None, - multirow=None, - caption=None, - label=None, - position=None, - ): # pragma: no cover # noqa: PR01, RT01, D200 - """ - Render object to a LaTeX tabular, longtable, or nested table. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas( - "to_latex", - buf=buf, - columns=columns, - col_space=col_space, - header=header, - index=index, - na_rep=na_rep, - formatters=formatters, - float_format=float_format, - sparsify=sparsify, - index_names=index_names, - bold_rows=bold_rows, - column_format=column_format, - longtable=longtable, - escape=escape, - encoding=encoding, - decimal=decimal, - multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow, - caption=caption, - label=label, - position=position, - ) - - def to_markdown( - self, - buf=None, - mode: str = "wt", - index: bool = True, - storage_options: StorageOptions = None, - **kwargs, - ): # noqa: PR01, RT01, D200 - """ - Print `BasePandasDataset` in Markdown-friendly format. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas( - "to_markdown", - buf=buf, - mode=mode, - index=index, - storage_options=storage_options, - **kwargs, - ) - - def to_pickle( - self, - path, - compression: CompressionOptions = "infer", - protocol: int = pkl.HIGHEST_PROTOCOL, - storage_options: StorageOptions = None, - ): # pragma: no cover # noqa: PR01, D200 - """ - Pickle (serialize) object to file. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - from snowflake.snowpark.modin.pandas import to_pickle - - to_pickle( - self, - path, - compression=compression, - protocol=protocol, - storage_options=storage_options, - ) - - def to_numpy( - self, - dtype: npt.DTypeLike | None = None, - copy: bool = False, - na_value: object = no_default, - **kwargs: Any, - ) -> np.ndarray: - """ - Convert the `BasePandasDataset` to a NumPy array or a Modin wrapper for NumPy array. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - if copy: - WarningMessage.ignored_argument( - operation="to_numpy", - argument="copy", - message="copy is ignored in Snowflake backend", - ) - return self._query_compiler.to_numpy( - dtype=dtype, - na_value=na_value, - **kwargs, - ) - - # TODO(williamma12): When this gets implemented, have the series one call this. - def to_period( - self, freq=None, axis=0, copy=True - ): # pragma: no cover # noqa: PR01, RT01, D200 - """ - Convert `BasePandasDataset` from DatetimeIndex to PeriodIndex. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas("to_period", freq=freq, axis=axis, copy=copy) - - def to_string( - self, - buf=None, - columns=None, - col_space=None, - header=True, - index=True, - na_rep="NaN", - formatters=None, - float_format=None, - sparsify=None, - index_names=True, - justify=None, - max_rows=None, - min_rows=None, - max_cols=None, - show_dimensions=False, - decimal=".", - line_width=None, - max_colwidth=None, - encoding=None, - ): # noqa: PR01, RT01, D200 - """ - Render a `BasePandasDataset` to a console-friendly tabular output. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas( - "to_string", - buf=buf, - columns=columns, - col_space=col_space, - header=header, - index=index, - na_rep=na_rep, - formatters=formatters, - float_format=float_format, - sparsify=sparsify, - index_names=index_names, - justify=justify, - max_rows=max_rows, - max_cols=max_cols, - show_dimensions=show_dimensions, - decimal=decimal, - line_width=line_width, - max_colwidth=max_colwidth, - encoding=encoding, - ) - - def to_sql( - self, - name, - con, - schema=None, - if_exists="fail", - index=True, - index_label=None, - chunksize=None, - dtype=None, - method=None, - ): # noqa: PR01, D200 - """ - Write records stored in a `BasePandasDataset` to a SQL database. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - new_query_compiler = self._query_compiler - # writing the index to the database by inserting it to the DF - if index: - if not index_label: - index_label = "index" - new_query_compiler = new_query_compiler.insert(0, index_label, self.index) - # so pandas._to_sql will not write the index to the database as well - index = False - - from modin.core.execution.dispatching.factories.dispatcher import ( - FactoryDispatcher, - ) - - FactoryDispatcher.to_sql( - new_query_compiler, - name=name, - con=con, - schema=schema, - if_exists=if_exists, - index=index, - index_label=index_label, - chunksize=chunksize, - dtype=dtype, - method=method, - ) - - # TODO(williamma12): When this gets implemented, have the series one call this. - def to_timestamp( - self, freq=None, how="start", axis=0, copy=True - ): # noqa: PR01, RT01, D200 - """ - Cast to DatetimeIndex of timestamps, at *beginning* of period. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas( - "to_timestamp", freq=freq, how=how, axis=axis, copy=copy - ) - - def to_xarray(self): # noqa: PR01, RT01, D200 - """ - Return an xarray object from the `BasePandasDataset`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas("to_xarray") - - def truediv( - self, other, axis="columns", level=None, fill_value=None - ): # noqa: PR01, RT01, D200 - """ - Get floating division of `BasePandasDataset` and `other`, element-wise (binary operator `truediv`). - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op( - "truediv", other, axis=axis, level=level, fill_value=fill_value - ) - - div = divide = truediv - - def truncate( - self, before=None, after=None, axis=None, copy=True - ): # noqa: PR01, RT01, D200 - """ - Truncate a `BasePandasDataset` before and after some index value. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - axis = self._get_axis_number(axis) - if ( - not self.axes[axis].is_monotonic_increasing - and not self.axes[axis].is_monotonic_decreasing - ): - raise ValueError("truncate requires a sorted index") - s = slice(*self.axes[axis].slice_locs(before, after)) - slice_obj = s if axis == 0 else (slice(None), s) - return self.iloc[slice_obj] - - def transform(self, func, axis=0, *args, **kwargs): # noqa: PR01, RT01, D200 - """ - Call ``func`` on self producing a `BasePandasDataset` with the same axis shape as self. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - kwargs["is_transform"] = True - self._validate_function(func) - try: - result = self.agg(func, axis=axis, *args, **kwargs) - except TypeError: - raise - except Exception as err: - raise ValueError("Transform function failed") from err - try: - assert len(result) == len(self) - except Exception: - raise ValueError("transforms cannot produce aggregated results") - return result - - def tz_convert(self, tz, axis=0, level=None, copy=None): # noqa: PR01, RT01, D200 - """ - Convert tz-aware axis to target time zone. - """ - if copy is None: - copy = True - return self._create_or_update_from_compiler( - self._query_compiler.tz_convert( - tz, axis=self._get_axis_number(axis), level=level, copy=copy - ), - inplace=(not copy), - ) - - def tz_localize( - self, tz, axis=0, level=None, copy=None, ambiguous="raise", nonexistent="raise" - ): # noqa: PR01, RT01, D200 - """ - Localize tz-naive index of a `BasePandasDataset` to target time zone. - """ - if copy is None: - copy = True - return self._create_or_update_from_compiler( - self._query_compiler.tz_localize( - tz, - axis=self._get_axis_number(axis), - level=level, - copy=copy, - ambiguous=ambiguous, - nonexistent=nonexistent, - ), - inplace=(not copy), - ) - - def var( - self, - axis: Axis | None = None, - skipna: bool = True, - ddof: int = 1, - numeric_only: bool = False, - **kwargs: Any, - ): - """ - Return unbiased variance over requested axis. - """ - kwargs.update({"ddof": ddof}) - return self._agg_helper( - func="var", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - def __abs__(self): - """ - Return a `BasePandasDataset` with absolute numeric value of each element. - - Returns - ------- - BasePandasDataset - Object containing the absolute value of each element. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.abs() - - def __and__(self, other): - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op("__and__", other, axis=0) - - def __rand__(self, other): - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op("__rand__", other, axis=0) - - def __array__(self, dtype=None): - """ - Return the values as a NumPy array. - - Parameters - ---------- - dtype : str or np.dtype, optional - The dtype of returned array. - - Returns - ------- - arr : np.ndarray - NumPy representation of Modin object. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - WarningMessage.single_warning( - "Calling __array__ on a modin object materializes all data into local memory.\n" - + "Since this can be called by 3rd party libraries silently, it can lead to \n" - + "unexpected delays or high memory usage. 
Use to_pandas() or to_numpy() to do \n" - + "this once explicitly.", - ) - arr = self.to_numpy(dtype) - return arr - - def __copy__(self, deep=True): - """ - Return the copy of the `BasePandasDataset`. - - Parameters - ---------- - deep : bool, default: True - Whether the copy should be deep or not. - - Returns - ------- - BasePandasDataset - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.copy(deep=deep) - - def __deepcopy__(self, memo=None): - """ - Return the deep copy of the `BasePandasDataset`. - - Parameters - ---------- - memo : Any, optional - Deprecated parameter. - - Returns - ------- - BasePandasDataset - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.copy(deep=True) - - def __eq__(self, other): - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.eq(other) - - def __finalize__(self, other, method=None, **kwargs): - """ - Propagate metadata from `other` to `self`. - - Parameters - ---------- - other : BasePandasDataset - The object from which to get the attributes that we are going - to propagate. - method : str, optional - A passed method name providing context on where `__finalize__` - was called. - **kwargs : dict - Additional keywords arguments to be passed to `__finalize__`. - - Returns - ------- - BasePandasDataset - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._default_to_pandas("__finalize__", other, method=method, **kwargs) - - def __ge__(self, right): - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.ge(right) - - def __getitem__(self, key): - """ - Retrieve dataset according to `key`. - - Parameters - ---------- - key : callable, scalar, slice, str or tuple - The global row index to retrieve data from. - - Returns - ------- - BasePandasDataset - Located dataset. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - key = apply_if_callable(key, self) - # If a slice is passed in, use .iloc[key]. - if isinstance(key, slice): - if (is_integer(key.start) or key.start is None) and ( - is_integer(key.stop) or key.stop is None - ): - return self.iloc[key] - else: - return self.loc[key] - - # If the object calling getitem is a Series, only use .loc[key] to filter index. - if isinstance(self, pd.Series): - return self.loc[key] - - # Sometimes the result of a callable is a DataFrame (e.g. df[df > 0]) - use where. - elif isinstance(key, pd.DataFrame): - return self.where(cond=key) - - # If the object is a boolean list-like object, use .loc[key] to filter index. - # The if statement is structured this way to avoid calling dtype and reduce query count. - if isinstance(key, pd.Series): - if key.dtype == bool: - return self.loc[key] - elif is_list_like(key): - if hasattr(key, "dtype"): - if key.dtype == bool: - return self.loc[key] - if (all(is_bool(k) for k in key)) and len(key) > 0: - return self.loc[key] - - # In all other cases, use .loc[:, key] to filter columns. - return self.loc[:, key] - - __hash__ = None - - def __gt__(self, right): - return self.gt(right) - - def __invert__(self): - """ - Apply bitwise inverse to each element of the `BasePandasDataset`. - - Returns - ------- - BasePandasDataset - New BasePandasDataset containing bitwise inverse to each value. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.__constructor__(query_compiler=self._query_compiler.invert()) - - def __le__(self, right): - return self.le(right) - - def __len__(self) -> int: - """ - Return length of info axis. - - Returns - ------- - int - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._query_compiler.get_axis_len(axis=0) - - def __lt__(self, right): - return self.lt(right) - - def __matmul__(self, other): - """ - Compute the matrix multiplication between the `BasePandasDataset` and `other`. - - Parameters - ---------- - other : BasePandasDataset or array-like - The other object to compute the matrix product with. - - Returns - ------- - BasePandasDataset, np.ndarray or scalar - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.dot(other) - - def __ne__(self, other): - return self.ne(other) - - def __neg__(self): - """ - Change the sign for every value of self. - - Returns - ------- - BasePandasDataset - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.__constructor__( - query_compiler=self._query_compiler.unary_op("__neg__") - ) - - def __nonzero__(self): - """ - Evaluate `BasePandasDataset` as boolean object. - - Raises - ------ - ValueError - Always since truth value for self is ambiguous. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - raise ValueError( - f"The truth value of a {self.__class__.__name__} is ambiguous. " - + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." - ) - - __bool__ = __nonzero__ - - def __or__(self, other): - return self._binary_op("__or__", other, axis=0) - - def __ror__(self, other): - return self._binary_op("__ror__", other, axis=0) - - def __sizeof__(self): - """ - Generate the total memory usage for an `BasePandasDataset`. - - Returns - ------- - int - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - - return self._default_to_pandas("__sizeof__") - - def __str__(self): # pragma: no cover - """ - Return str(self). - - Returns - ------- - str - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return repr(self) - - def __xor__(self, other): - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op("__xor__", other, axis=0) - - def __rxor__(self, other): - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self._binary_op("__rxor__", other, axis=0) - - @property - def size(self) -> int: - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return np.prod(self.shape) # type: ignore[return-value] - - @property - def values(self) -> np.ndarray: - """ - Return a NumPy representation of the `BasePandasDataset`. - """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - return self.to_numpy() - - def _repartition(self, axis: int | None = None): - """ - Repartitioning Modin objects to get ideal partitions inside. - - Allows to improve performance where the query compiler can't improve - yet by doing implicit repartitioning. - - Parameters - ---------- - axis : {0, 1, None}, optional - The axis along which the repartitioning occurs. - `None` is used for repartitioning along both axes. - - Returns - ------- - DataFrame or Series - The repartitioned dataframe or series, depending on the original type. 
- """ - # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - allowed_axis_values = (0, 1, None) - if axis not in allowed_axis_values: - raise ValueError( - f"Passed `axis` parameter: {axis}, but should be one of {allowed_axis_values}" - ) - return self.__constructor__( - query_compiler=self._query_compiler.repartition(axis=axis) - ) - - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): - """ - Apply the `ufunc` to the `BasePandasDataset`. - - Parameters - ---------- - ufunc : np.ufunc - The NumPy ufunc to apply. - method : str - The method to apply. - *inputs : tuple - The inputs to the ufunc. - **kwargs : dict - Additional keyword arguments. - - Returns - ------- - BasePandasDataset - The result of the ufunc applied to the `BasePandasDataset`. - """ - # Use pandas version of ufunc if it exists - if method != "__call__": - # Return sentinel value NotImplemented - return NotImplemented - from snowflake.snowpark.modin.plugin.utils.numpy_to_pandas import ( - numpy_to_pandas_universal_func_map, - ) - - if ufunc.__name__ in numpy_to_pandas_universal_func_map: - ufunc = numpy_to_pandas_universal_func_map[ufunc.__name__] - return ufunc(self, inputs[1:], kwargs) - # return the sentinel NotImplemented if we do not support this function - return NotImplemented - - def __array_function__( - self, func: callable, types: tuple, args: tuple, kwargs: dict - ): - """ - Apply the `func` to the `BasePandasDataset`. - - Parameters - ---------- - func : np.func - The NumPy func to apply. - types : tuple - The types of the args. - args : tuple - The args to the func. - kwargs : dict - Additional keyword arguments. - - Returns - ------- - BasePandasDataset - The result of the ufunc applied to the `BasePandasDataset`. - """ - from snowflake.snowpark.modin.plugin.utils.numpy_to_pandas import ( - numpy_to_pandas_func_map, - ) - - if func.__name__ in numpy_to_pandas_func_map: - return numpy_to_pandas_func_map[func.__name__](*args, **kwargs) - else: - # per NEP18 we raise NotImplementedError so that numpy can intercept - return NotImplemented # pragma: no cover From e1149ca90e853eb1bfbb575e3534d9b004135919 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Fri, 30 Aug 2024 16:09:16 -0700 Subject: [PATCH 4/7] [SNOW-1458134]: Add support for monotonic checks for Series and Index (#2186) 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-1458134 2. Fill out the following pre-review checklist: - [ ] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. 3. Please describe how your code solves the related issue. 
Add support for Index and Series is_monotonic_increasing and is_monotonic_decreasing --- CHANGELOG.md | 2 + docs/source/modin/series.rst | 2 + .../modin/supported/index_supported.rst | 4 +- .../modin/supported/series_supported.rst | 4 +- .../compiler/snowflake_query_compiler.py | 128 +++++++++++++++--- .../modin/plugin/docstrings/series.py | 54 ++++++-- .../snowpark/modin/plugin/extensions/index.py | 28 +++- tests/integ/modin/index/test_monotonic.py | 97 +++++++++++++ tests/integ/modin/series/test_monotonic.py | 97 +++++++++++++ tests/integ/modin/test_unimplemented.py | 4 - 10 files changed, 372 insertions(+), 48 deletions(-) create mode 100644 tests/integ/modin/index/test_monotonic.py create mode 100644 tests/integ/modin/series/test_monotonic.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 8aab4250764..473ce424248 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -78,6 +78,8 @@ - Added support for `DatetimeIndex.round`, `DatetimeIndex.floor` and `DatetimeIndex.ceil`. - Added support for `Series.dt.days_in_month` and `Series.dt.daysinmonth`. - Added support for `DataFrameGroupBy.value_counts` and `SeriesGroupBy.value_counts`. +- Added support for `Series.is_monotonic_increasing` and `Series.is_monotonic_decreasing`. +- Added support for `Index.is_monotonic_increasing` and `Index.is_monotonic_decreasing`. #### Improvements diff --git a/docs/source/modin/series.rst b/docs/source/modin/series.rst index 507d6663f32..fbd936db2f9 100644 --- a/docs/source/modin/series.rst +++ b/docs/source/modin/series.rst @@ -26,6 +26,8 @@ Series Series.equals Series.empty Series.hasnans + Series.is_monotonic_increasing + Series.is_monotonic_decreasing Series.name Series.ndim Series.shape diff --git a/docs/source/modin/supported/index_supported.rst b/docs/source/modin/supported/index_supported.rst index 9db80686454..0c413c201fb 100644 --- a/docs/source/modin/supported/index_supported.rst +++ b/docs/source/modin/supported/index_supported.rst @@ -20,9 +20,9 @@ Attributes +-----------------------------+---------------------------------+----------------------------------------------------+ | ``values`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``is_monotonic_increasing`` | N | | +| ``is_monotonic_increasing`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``is_monotonic_decreasing`` | N | | +| ``is_monotonic_decreasing`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``is_unique`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ diff --git a/docs/source/modin/supported/series_supported.rst b/docs/source/modin/supported/series_supported.rst index 331be4d0298..618b88d5034 100644 --- a/docs/source/modin/supported/series_supported.rst +++ b/docs/source/modin/supported/series_supported.rst @@ -43,9 +43,9 @@ Attributes +-----------------------------+---------------------------------+----------------------------------------------------+ | ``index`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``is_monotonic_decreasing`` | N | | +| ``is_monotonic_decreasing`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``is_monotonic_increasing`` | 
N | | +| ``is_monotonic_increasing`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``is_unique`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index a803eb332e7..be994015eac 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2283,9 +2283,82 @@ def reindex( else: return self._reindex_axis_1(labels=labels, **kwargs) + def is_monotonic_decreasing(self) -> "SnowflakeQueryCompiler": + """ + Returns a QueryCompiler containing only a column that checks for monotonically + decreasing values in the first data column of this QueryCompiler. + + Returns + ------- + SnowflakeQueryCompiler + QueryCompiler with column to ascertain whether data is monotonically decreasing. + """ + return self._check_monotonic(increasing=False) + + def is_monotonic_increasing(self) -> "SnowflakeQueryCompiler": + """ + Returns a QueryCompiler containing only a column that checks for monotonically + increasing values in the first data column of this QueryCompiler. + + Returns + ------- + SnowflakeQueryCompiler + QueryCompiler with column to ascertain whether data is monotonically increasing. + """ + return self._check_monotonic(increasing=True) + + def _check_monotonic(self, increasing: bool) -> "SnowflakeQueryCompiler": + """ + Returns a QueryCompiler containing only a column that checks for monotonically + decreasing or increasing values (depending on `increasing`) in the first data column of this QueryCompiler. + + Parameters + ---------- + increasing: bool + Whether to check for monotonically increasing or decreasing values. + + Returns + ------- + SnowflakeQueryCompiler + QueryCompiler with column to ascertain whether data is monotonically decreasing/increasing. 
+ """ + col_to_check = self._modin_frame.data_column_snowflake_quoted_identifiers[0] + ( + new_qc, + monotonic_increasing_snowflake_quoted_identifier, + monotonic_decreasing_snowflake_quoted_identifier, + ) = self._add_columns_for_monotonicity_checks( + col_to_check=col_to_check, + columns_to_add="increasing" if increasing else "decreasing", + ) + data_column_snowflake_quoted_identifiers = [] + if increasing: + data_column_snowflake_quoted_identifiers.append( + monotonic_increasing_snowflake_quoted_identifier + ) + else: + data_column_snowflake_quoted_identifiers.append( + monotonic_decreasing_snowflake_quoted_identifier + ) + new_modin_frame = new_qc._modin_frame + return SnowflakeQueryCompiler( + InternalFrame.create( + ordered_dataframe=new_modin_frame.ordered_dataframe.limit( + n=1, sort=False + ), + data_column_pandas_index_names=new_modin_frame.data_column_pandas_index_names, + data_column_pandas_labels=["monotonic_column"], + data_column_snowflake_quoted_identifiers=data_column_snowflake_quoted_identifiers, + index_column_pandas_labels=new_modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=new_modin_frame.index_column_snowflake_quoted_identifiers, + data_column_types=None, + index_column_types=None, + ) + ) + def _add_columns_for_monotonicity_checks( - self, col_to_check: str - ) -> tuple["SnowflakeQueryCompiler", str, str]: + self, col_to_check: str, columns_to_add: Optional[str] = None + ) -> tuple["SnowflakeQueryCompiler", Optional[str], Optional[str]]: """ Adds columns that check for monotonicity (increasing or decreasing) in the specified column. @@ -2294,6 +2367,8 @@ def _add_columns_for_monotonicity_checks( ---------- col_to_check : str The Snowflake quoted identifier for the column whose monotonicity to check. + columns_to_add : str, optional + Whether or not to add all columns, and if not, which columns to add. 
Returns ------- @@ -2304,9 +2379,16 @@ def _add_columns_for_monotonicity_checks( """ self._raise_not_implemented_error_for_timedelta() + assert columns_to_add in [ + None, + "increasing", + "decreasing", + ], "Invalid value passed to function" modin_frame = self._modin_frame modin_frame = modin_frame.ensure_row_position_column() row_position_column = modin_frame.row_position_snowflake_quoted_identifier + monotonic_decreasing_snowflake_quoted_id = None + monotonic_increasing_snowflake_quoted_id = None modin_frame = modin_frame.append_column( "_index_lag_col", lag(col_to_check).over(Window.order_by(row_position_column)), @@ -2314,26 +2396,28 @@ def _add_columns_for_monotonicity_checks( lag_col_snowflake_quoted_id = ( modin_frame.data_column_snowflake_quoted_identifiers[-1] ) - modin_frame = modin_frame.append_column( - "_is_monotonic_decreasing", - coalesce( - min_(col(col_to_check) < col(lag_col_snowflake_quoted_id)).over(), - pandas_lit(False), - ), - ) - monotonic_decreasing_snowflake_quoted_id = ( - modin_frame.data_column_snowflake_quoted_identifiers[-1] - ) - modin_frame = modin_frame.append_column( - "_is_monotonic_increasing", - coalesce( - min_(col(col_to_check) > col(lag_col_snowflake_quoted_id)).over(), - pandas_lit(False), - ), - ) - monotonic_increasing_snowflake_quoted_id = ( - modin_frame.data_column_snowflake_quoted_identifiers[-1] - ) + if columns_to_add in [None, "decreasing"]: + modin_frame = modin_frame.append_column( + "_is_monotonic_decreasing", + coalesce( + min_(col(col_to_check) <= col(lag_col_snowflake_quoted_id)).over(), + pandas_lit(False), + ), + ) + monotonic_decreasing_snowflake_quoted_id = ( + modin_frame.data_column_snowflake_quoted_identifiers[-1] + ) + if columns_to_add in [None, "increasing"]: + modin_frame = modin_frame.append_column( + "_is_monotonic_increasing", + coalesce( + min_(col(col_to_check) >= col(lag_col_snowflake_quoted_id)).over(), + pandas_lit(False), + ), + ) + monotonic_increasing_snowflake_quoted_id = ( + modin_frame.data_column_snowflake_quoted_identifiers[-1] + ) data_column_pandas_labels = modin_frame.data_column_pandas_labels data_column_snowflake_quoted_identifiers = ( modin_frame.data_column_snowflake_quoted_identifiers diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py index 4878c82635a..a8ab6a60c77 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -3661,6 +3661,48 @@ def hasnans(): Return True if there are any NaNs. """ + @property + def is_monotonic_decreasing(): + """ + Return boolean if values in the object are monotonically decreasing. + + Returns + ------- + bool + Whether or not the Series is monotonically decreasing. + + Examples + -------- + >>> s = pd.Series([3, 2, 2, 1]) + >>> s.is_monotonic_decreasing + True + + >>> s = pd.Series([1, 2, 3]) + >>> s.is_monotonic_decreasing + False + """ + + @property + def is_monotonic_increasing(): + """ + Return boolean if values in the object are monotonically increasing. + + Returns + ------- + bool + Whether or not the Series is monotonically increasing. + + Examples + -------- + >>> s = pd.Series([1, 2, 2]) + >>> s.is_monotonic_increasing + True + + >>> s = pd.Series([3, 2, 1]) + >>> s.is_monotonic_increasing + False + """ + def isna(): """ Detect missing values. 
@@ -3721,18 +3763,6 @@ def isnull(): dtype: bool """ - @property - def is_monotonic_increasing(): - """ - Return True if values in the Series are monotonic_increasing. - """ - - @property - def is_monotonic_decreasing(): - """ - Return True if values in the Series are monotonic_decreasing. - """ - @property def is_unique(): """ diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index 808489b8917..2682fd2b985 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -398,8 +398,7 @@ def values(self) -> ArrayLike: return self.to_pandas().values @property - @index_not_implemented() - def is_monotonic_increasing(self) -> None: + def is_monotonic_increasing(self) -> bool: """ Return a boolean if the values are equal or increasing. @@ -411,12 +410,20 @@ def is_monotonic_increasing(self) -> None: See Also -------- Index.is_monotonic_decreasing : Check if the values are equal or decreasing + + Examples + -------- + >>> pd.Index([1, 2, 3]).is_monotonic_increasing + True + >>> pd.Index([1, 2, 2]).is_monotonic_increasing + True + >>> pd.Index([1, 3, 2]).is_monotonic_increasing + False """ - # TODO: SNOW-1458134 implement is_monotonic_increasing + return self.to_series().is_monotonic_increasing @property - @index_not_implemented() - def is_monotonic_decreasing(self) -> None: + def is_monotonic_decreasing(self) -> bool: """ Return a boolean if the values are equal or decreasing. @@ -428,8 +435,17 @@ def is_monotonic_decreasing(self) -> None: See Also -------- Index.is_monotonic_increasing : Check if the values are equal or increasing + + Examples + -------- + >>> pd.Index([3, 2, 1]).is_monotonic_decreasing + True + >>> pd.Index([3, 2, 2]).is_monotonic_decreasing + True + >>> pd.Index([3, 1, 2]).is_monotonic_decreasing + False """ - # TODO: SNOW-1458134 implement is_monotonic_decreasing + return self.to_series().is_monotonic_decreasing @property def is_unique(self) -> bool: diff --git a/tests/integ/modin/index/test_monotonic.py b/tests/integ/modin/index/test_monotonic.py new file mode 100644 index 00000000000..5a15e4eb021 --- /dev/null +++ b/tests/integ/modin/index/test_monotonic.py @@ -0,0 +1,97 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
+# + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.modin.sql_counter import sql_count_checker + + +@pytest.mark.parametrize( + "values", [[1, 2, 3], [3, 2, 1], [1, 3, 2], [1, 2, 2], [1, np.NaN, 3]] +) +@sql_count_checker(query_count=1) +def test_monotonic_increasing_numbers(values): + assert ( + pd.Index(values).is_monotonic_increasing + == native_pd.Index(values).is_monotonic_increasing + ) + + +@pytest.mark.parametrize( + "values", [[3, 2, 1], [1, 2, 3], [3, 1, 2], [2, 2, 1], [3, np.NaN, 1]] +) +@sql_count_checker(query_count=1) +def test_monotonic_decreasing_numbers(values): + assert ( + pd.Index(values).is_monotonic_decreasing + == native_pd.Index(values).is_monotonic_decreasing + ) + + +@pytest.mark.parametrize( + "values", [["a", "b", "c"], ["c", "b", "a"], ["a", "c", "b"], ["ca", "cab", "cat"]] +) +@sql_count_checker(query_count=1) +def test_monotonic_increasing_str(values): + assert ( + pd.Index(values).is_monotonic_increasing + == native_pd.Index(values).is_monotonic_increasing + ) + + +@pytest.mark.parametrize( + "values", [["c", "b", "a"], ["a", "b", "c"], ["c", "a", "b"], ["cat", "cab", "ca"]] +) +@sql_count_checker(query_count=1) +def test_monotonic_decreasing_str(values): + assert ( + pd.Index(values).is_monotonic_decreasing + == native_pd.Index(values).is_monotonic_decreasing + ) + + +@pytest.mark.parametrize( + "values", + [ + native_pd.date_range(start="1/1/2018", end="1/03/2018").values, + native_pd.date_range(start="1/1/2018", end="1/03/2018").values[::-1], + native_pd.date_range(start="1/1/2018", end="1/03/2018").values[[0, 2, 1]], + [ + native_pd.Timestamp("2018-01-01 00:00:00"), + native_pd.NaT, + native_pd.Timestamp("2018-01-01 01:20:00"), + ], + ], +) +@sql_count_checker(query_count=1) +def test_monotonic_increasing_dates(values): + assert ( + pd.DatetimeIndex(values).is_monotonic_increasing + == native_pd.DatetimeIndex(values).is_monotonic_increasing + ) + + +@pytest.mark.parametrize( + "values", + [ + native_pd.date_range(start="1/1/2018", end="1/03/2018").values[::-1], + native_pd.date_range(start="1/1/2018", end="1/03/2018").values, + native_pd.date_range(start="1/1/2018", end="1/03/2018").values[[2, 0, 1]], + [ + native_pd.Timestamp("2018-01-01 01:20:00"), + native_pd.NaT, + native_pd.Timestamp("2018-01-01 00:00:00"), + ], + ], +) +@sql_count_checker(query_count=1) +def test_monotonic_decreasing_dates(values): + assert ( + pd.DatetimeIndex(values).is_monotonic_decreasing + == native_pd.DatetimeIndex(values).is_monotonic_decreasing + ) diff --git a/tests/integ/modin/series/test_monotonic.py b/tests/integ/modin/series/test_monotonic.py new file mode 100644 index 00000000000..8726b9d9bd8 --- /dev/null +++ b/tests/integ/modin/series/test_monotonic.py @@ -0,0 +1,97 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
+# + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.modin.sql_counter import sql_count_checker + + +@pytest.mark.parametrize( + "values", [[1, 2, 3], [3, 2, 1], [1, 3, 2], [1, 2, 2], [1, np.NaN, 3]] +) +@sql_count_checker(query_count=1) +def test_monotonic_increasing_numbers(values): + assert ( + pd.Series(values).is_monotonic_increasing + == native_pd.Series(values).is_monotonic_increasing + ) + + +@pytest.mark.parametrize( + "values", [[3, 2, 1], [1, 2, 3], [3, 1, 2], [2, 2, 1], [3, np.NaN, 1]] +) +@sql_count_checker(query_count=1) +def test_monotonic_decreasing_numbers(values): + assert ( + pd.Series(values).is_monotonic_decreasing + == native_pd.Series(values).is_monotonic_decreasing + ) + + +@pytest.mark.parametrize( + "values", [["a", "b", "c"], ["c", "b", "a"], ["a", "c", "b"], ["ca", "cab", "cat"]] +) +@sql_count_checker(query_count=1) +def test_monotonic_increasing_str(values): + assert ( + pd.Series(values).is_monotonic_increasing + == native_pd.Series(values).is_monotonic_increasing + ) + + +@pytest.mark.parametrize( + "values", [["c", "b", "a"], ["a", "b", "c"], ["c", "a", "b"], ["cat", "cab", "ca"]] +) +@sql_count_checker(query_count=1) +def test_monotonic_decreasing_str(values): + assert ( + pd.Series(values).is_monotonic_decreasing + == native_pd.Series(values).is_monotonic_decreasing + ) + + +@pytest.mark.parametrize( + "values", + [ + native_pd.date_range(start="1/1/2018", end="1/03/2018").values, + native_pd.date_range(start="1/1/2018", end="1/03/2018").values[::-1], + native_pd.date_range(start="1/1/2018", end="1/03/2018").values[[0, 2, 1]], + [ + native_pd.Timestamp("2018-01-01 00:00:00"), + native_pd.NaT, + native_pd.Timestamp("2018-01-01 01:20:00"), + ], + ], +) +@sql_count_checker(query_count=1) +def test_monotonic_increasing_dates(values): + assert ( + pd.Series(values).is_monotonic_increasing + == native_pd.Series(values).is_monotonic_increasing + ) + + +@pytest.mark.parametrize( + "values", + [ + native_pd.date_range(start="1/1/2018", end="1/03/2018").values[::-1], + native_pd.date_range(start="1/1/2018", end="1/03/2018").values, + native_pd.date_range(start="1/1/2018", end="1/03/2018").values[[2, 0, 1]], + [ + native_pd.Timestamp("2018-01-01 01:20:00"), + native_pd.NaT, + native_pd.Timestamp("2018-01-01 00:00:00"), + ], + ], +) +@sql_count_checker(query_count=1) +def test_monotonic_decreasing_dates(values): + assert ( + pd.Series(values).is_monotonic_decreasing + == native_pd.Series(values).is_monotonic_decreasing + ) diff --git a/tests/integ/modin/test_unimplemented.py b/tests/integ/modin/test_unimplemented.py index 8b1d6ef182f..deb5bce6af1 100644 --- a/tests/integ/modin/test_unimplemented.py +++ b/tests/integ/modin/test_unimplemented.py @@ -81,8 +81,6 @@ def helper(df): # unsupported methods that can only be applied on series # This set triggers SeriesDefault.register UNSUPPORTED_SERIES_METHODS = [ - (lambda se: se.is_monotonic_increasing, "property fget:is_monotonic_increasing"), - (lambda se: se.is_monotonic_decreasing, "property fget:is_monotonic_decreasing"), (lambda df: df.transform(lambda x: x + 1), "transform"), ] @@ -180,8 +178,6 @@ def test_unsupported_str_methods(func, func_name, caplog) -> None: # unsupported methods for Index UNSUPPORTED_INDEX_METHODS = [ - lambda idx: idx.is_monotonic_increasing(), - lambda idx: idx.is_monotonic_decreasing(), lambda idx: idx.nbytes(), lambda idx: idx.memory_usage(), lambda idx: idx.delete(), From 
3c1db0722e233d951860ce44bbb6dae18f8e9852 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Fri, 30 Aug 2024 17:15:03 -0700 Subject: [PATCH 5/7] [SNOW-1502893]: Add support for `pd.crosstab` (#1837) 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-1502893 2. Fill out the following pre-review checklist: - [ ] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. 3. Please describe how your code solves the related issue. Add support for pd.crosstab. --- CHANGELOG.md | 1 + docs/source/modin/general_functions.rst | 1 + .../modin/supported/general_supported.rst | 5 +- .../snowpark/modin/pandas/general.py | 330 ++++++++- .../compiler/snowflake_query_compiler.py | 32 +- tests/integ/modin/crosstab/conftest.py | 91 +++ tests/integ/modin/crosstab/test_crosstab.py | 639 ++++++++++++++++++ tests/unit/modin/test_unsupported.py | 1 - 8 files changed, 1071 insertions(+), 29 deletions(-) create mode 100644 tests/integ/modin/crosstab/conftest.py create mode 100644 tests/integ/modin/crosstab/test_crosstab.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 473ce424248..005aaa3a8dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -80,6 +80,7 @@ - Added support for `DataFrameGroupBy.value_counts` and `SeriesGroupBy.value_counts`. - Added support for `Series.is_monotonic_increasing` and `Series.is_monotonic_decreasing`. - Added support for `Index.is_monotonic_increasing` and `Index.is_monotonic_decreasing`. +- Added support for `pd.crosstab`. #### Improvements diff --git a/docs/source/modin/general_functions.rst b/docs/source/modin/general_functions.rst index 803a901ac15..858bc54003e 100644 --- a/docs/source/modin/general_functions.rst +++ b/docs/source/modin/general_functions.rst @@ -11,6 +11,7 @@ General functions :toctree: pandas_api/ melt + crosstab pivot pivot_table cut diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst index b055ed9dc6d..a12951d00f6 100644 --- a/docs/source/modin/supported/general_supported.rst +++ b/docs/source/modin/supported/general_supported.rst @@ -18,7 +18,10 @@ Data manipulations | ``concat`` | P | ``levels`` is not supported, | | | | | ``copy`` is ignored | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``crosstab`` | N | | | +| ``crosstab`` | P | | ``N`` if ``aggfunc`` is not one of | +| | | | "count", "mean", "min", "max", or "sum", or | +| | | | margins is True, normalize is "all" or True, | +| | | | and values is passed. 
| +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``cut`` | P | ``retbins``, ``labels`` | ``N`` if ``retbins=True``or ``labels!=False`` | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 07f0617d612..df19e9eac91 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -22,7 +22,7 @@ """Implement pandas general API.""" from __future__ import annotations -from collections.abc import Hashable, Iterable, Mapping, Sequence +from collections.abc import Callable, Hashable, Iterable, Mapping, Sequence from datetime import date, datetime, timedelta, tzinfo from logging import getLogger from typing import TYPE_CHECKING, Any, Literal, Union @@ -49,7 +49,7 @@ _infer_tz_from_endpoints, _maybe_normalize_endpoints, ) -from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.common import is_list_like, is_nested_list_like from pandas.core.dtypes.inference import is_array_like from pandas.core.tools.datetimes import ( ArrayConvertible, @@ -1982,8 +1982,6 @@ def melt( @snowpark_pandas_telemetry_standalone_function_decorator -@pandas_module_level_function_not_implemented() -@_inherit_docstrings(pandas.crosstab, apilink="pandas.crosstab") def crosstab( index, columns, @@ -1998,21 +1996,319 @@ def crosstab( ) -> DataFrame: # noqa: PR01, RT01, D200 """ Compute a simple cross tabulation of two (or more) factors. + + By default, computes a frequency table of the factors unless an array + of values and an aggregation function are passed. + + Parameters + ---------- + index : array-like, Series, or list of arrays/Series + Values to group by in the rows. + columns : array-like, Series, or list of arrays/Series + Values to group by in the columns. + values : array-like, optional + Array of values to aggregate according to the factors. + Requires aggfunc be specified. + rownames : sequence, default None + If passed, must match number of row arrays passed. + colnames : sequence, default None + If passed, must match number of column arrays passed. + aggfunc : function, optional + If specified, requires values be specified as well. + margins : bool, default False + Add row/column margins (subtotals). + margins_name : str, default 'All' + Name of the row/column that will contain the totals when margins is True. + dropna : bool, default True + Do not include columns whose entries are all NaN. + + normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False + Normalize by dividing all values by the sum of values. + + * If passed 'all' or True, will normalize over all values. + * If passed 'index' will normalize over each row. + * If passed 'columns' will normalize over each column. + * If margins is True, will also normalize margin values. + + Returns + ------- + Snowpark pandas :class:`~snowflake.snowpark.modin.pandas.DataFrame` + Cross tabulation of the data. + + Notes + ----- + + Raises NotImplementedError if aggfunc is not one of "count", "mean", "min", "max", or "sum", or + margins is True, normalize is True or all, and values is passed. + + Examples + -------- + >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", + ... 
"bar", "bar", "foo", "foo", "foo"], dtype=object) + >>> b = np.array(["one", "one", "one", "two", "one", "one", + ... "one", "two", "two", "two", "one"], dtype=object) + >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", + ... "shiny", "dull", "shiny", "shiny", "shiny"], + ... dtype=object) + >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) # doctest: +NORMALIZE_WHITESPACE + b one two + c dull shiny dull shiny + a + bar 1 2 1 0 + foo 2 2 1 2 """ - # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py - pandas_crosstab = pandas.crosstab( - index, - columns, - values, - rownames, - colnames, - aggfunc, - margins, - margins_name, - dropna, - normalize, + if values is None and aggfunc is not None: + raise ValueError("aggfunc cannot be used without values.") + + if values is not None and aggfunc is None: + raise ValueError("values cannot be used without an aggfunc.") + + if not is_nested_list_like(index): + index = [index] + if not is_nested_list_like(columns): + columns = [columns] + + if ( + values is not None + and margins is True + and (normalize is True or normalize == "all") + ): + raise NotImplementedError( + 'Snowpark pandas does not yet support passing in margins=True, normalize="all", and values.' + ) + + user_passed_rownames = rownames is not None + user_passed_colnames = colnames is not None + + from pandas.core.reshape.pivot import _build_names_mapper, _get_names + + def _get_names_wrapper(list_of_objs, names, prefix): + """ + Helper method to expand DataFrame objects containing + multiple columns into Series, since `_get_names` expects + one column per entry. + """ + expanded_list_of_objs = [] + for obj in list_of_objs: + if isinstance(obj, DataFrame): + for col in obj.columns: + expanded_list_of_objs.append(obj[col]) + else: + expanded_list_of_objs.append(obj) + return _get_names(expanded_list_of_objs, names, prefix) + + rownames = _get_names_wrapper(index, rownames, prefix="row") + colnames = _get_names_wrapper(columns, colnames, prefix="col") + + ( + rownames_mapper, + unique_rownames, + colnames_mapper, + unique_colnames, + ) = _build_names_mapper(rownames, colnames) + + pass_objs = [x for x in index + columns if isinstance(x, (Series, DataFrame))] + row_idx_names = None + col_idx_names = None + if pass_objs: + # If we have any Snowpark pandas objects in the index or columns, then we + # need to find the intersection of their indices, and only pick rows from + # the objects that have indices in the intersection of their indices. + # After we do that, we then need to append the non Snowpark pandas objects + # using the intersection of indices as the final index for the DataFrame object. + # First, we separate the objects into Snowpark pandas objects, and non-Snowpark + # pandas objects (while renaming them so that they have unique names). 
+ rownames_idx = 0 + row_idx_names = [] + dfs = [] + arrays = [] + array_lengths = [] + for obj in index: + if isinstance(obj, Series): + row_idx_names.append(obj.name) + df = pd.DataFrame(obj) + df.columns = [unique_rownames[rownames_idx]] + rownames_idx += 1 + dfs.append(df) + elif isinstance(obj, DataFrame): + row_idx_names.extend(obj.columns) + obj.columns = unique_rownames[ + rownames_idx : rownames_idx + len(obj.columns) + ] + rownames_idx += len(obj.columns) + dfs.append(obj) + else: + row_idx_names.append(None) + array_lengths.append(len(obj)) + df = pd.DataFrame(obj) + df.columns = unique_rownames[ + rownames_idx : rownames_idx + len(df.columns) + ] + rownames_idx += len(df.columns) + arrays.append(df) + + colnames_idx = 0 + col_idx_names = [] + for obj in columns: + if isinstance(obj, Series): + col_idx_names.append(obj.name) + df = pd.DataFrame(obj) + df.columns = [unique_colnames[colnames_idx]] + colnames_idx += 1 + dfs.append(df) + elif isinstance(obj, DataFrame): + col_idx_names.extend(obj.columns) + obj.columns = unique_colnames[ + colnames_idx : colnames_idx + len(obj.columns) + ] + colnames_idx += len(obj.columns) + dfs.append(obj) + else: + col_idx_names.append(None) + array_lengths.append(len(obj)) + df = pd.DataFrame(obj) + df.columns = unique_colnames[ + colnames_idx : colnames_idx + len(df.columns) + ] + colnames_idx += len(df.columns) + arrays.append(df) + + if len(set(array_lengths)) > 1: + raise ValueError("All arrays must be of the same length") + + # Now, we have two lists - a list of Snowpark pandas objects, and a list of objects + # that were not passed in as Snowpark pandas objects, but that we have converted + # to Snowpark pandas objects to give them column names. We can perform inner joins + # on the dfs list to get a DataFrame with the final index (that is only an intersection + # of indices.) + df = dfs[0] + for right in dfs[1:]: + df = df.merge(right, left_index=True, right_index=True) + if len(arrays) > 0: + index = df.index + right_df = pd.concat(arrays, axis=1) + # Increases query count by 1, but necessary for error checking. + index_length = len(df) + if index_length != array_lengths[0]: + raise ValueError( + f"Length mismatch: Expected {array_lengths[0]} rows, received array of length {index_length}" + ) + right_df.index = index + df = df.merge(right_df, left_index=True, right_index=True) + else: + data = { + **dict(zip(unique_rownames, index)), + **dict(zip(unique_colnames, columns)), + } + df = DataFrame(data) + + if values is None: + df["__dummy__"] = 0 + kwargs = {"aggfunc": "count"} + else: + df["__dummy__"] = values + kwargs = {"aggfunc": aggfunc} + + table = df.pivot_table( + "__dummy__", + index=unique_rownames, + columns=unique_colnames, + margins=margins, + margins_name=margins_name, + dropna=dropna, + **kwargs, # type: ignore[arg-type] ) - return DataFrame(pandas_crosstab) + + if row_idx_names is not None and not user_passed_rownames: + table.index = table.index.set_names(row_idx_names) + + if col_idx_names is not None and not user_passed_colnames: + table.columns = table.columns.set_names(col_idx_names) + + if aggfunc is None: + # If no aggfunc is provided, we are computing frequencies. Since we use + # pivot_table above, pairs that are not observed will get a NaN value, + # so we need to fill all NaN values with 0. + table = table.fillna(0) + + # We must explicitly check that the value of normalize is not False here, + # as a valid value of normalize is `0` (for normalizing index). 
+ if normalize is not False: + if normalize not in [0, 1, "index", "columns", "all", True]: + raise ValueError("Not a valid normalize argument") + if normalize is True: + normalize = "all" + normalize = {0: "index", 1: "columns"}.get(normalize, normalize) + + # Actual Normalizations + normalizers: dict[bool | str, Callable] = { + "all": lambda x: x / x.sum(axis=0).sum(), + "columns": lambda x: x / x.sum(), + "index": lambda x: x.div(x.sum(axis=1), axis="index"), + } + + if margins is False: + + f = normalizers[normalize] + names = table.columns.names + table = f(table) + table.columns.names = names + table = table.fillna(0) + else: + # keep index and column of pivoted table + table_index = table.index + table_columns = table.columns + + column_margin = table.iloc[:-1, -1] + + if normalize == "columns": + # keep the core table + table = table.iloc[:-1, :-1] + + # Normalize core + f = normalizers[normalize] + table = f(table) + table = table.fillna(0) + # Fix Margins + column_margin = column_margin / column_margin.sum() + table = pd.concat([table, column_margin], axis=1) + table = table.fillna(0) + table.columns = table_columns + + elif normalize == "index": + table = table.iloc[:, :-1] + + # Normalize core + f = normalizers[normalize] + table = f(table) + table = table.fillna(0).reindex(index=table_index) + + elif normalize == "all": + # Normalize core + f = normalizers[normalize] + + # When we perform the normalization function, we take the sum over + # the rows, and divide every value by the sum. Since margins is included + # though, the result of the sum is actually 2 * the sum of the original + # values (since the margin itself is the sum of the original values), + # so we need to multiply by 2 here to account for that. + # The alternative would be to apply normalization to the main table + # and the index margins separately, but that would require additional joins + # to get the final table, which we want to avoid. + table = f(table.iloc[:, :-1]) * 2.0 + + column_margin = column_margin / column_margin.sum() + table = pd.concat([table, column_margin], axis=1) + table.iloc[-1, -1] = 1 + + table = table.fillna(0) + table.index = table_index + table.columns = table_columns + + table = table.rename_axis(index=rownames_mapper, axis=0) + table = table.rename_axis(columns=colnames_mapper, axis=1) + + return table # Adding docstring since pandas docs don't have web section for this function. 
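(Editorial aside, not part of the patch: the three normalize modes handled above are plain row/column/total divisions of the frequency table. Below is a minimal sketch in native pandas/NumPy that mirrors the arithmetic of the `normalizers` mapping; the sample arrays are invented for illustration only.)

# Illustrative sketch only -- not part of the patch.
import numpy as np
import pandas as native_pd

a = np.array(["foo", "foo", "bar", "bar", "foo"], dtype=object)
b = np.array(["one", "two", "one", "two", "one"], dtype=object)

freq = native_pd.crosstab(a, b)                      # raw counts
by_index = freq.div(freq.sum(axis=1), axis="index")  # normalize="index": each row sums to 1
by_columns = freq / freq.sum()                       # normalize="columns": each column sums to 1
overall = freq / freq.sum(axis=0).sum()              # normalize="all" or True: whole table sums to 1
print(by_index, by_columns, overall, sep="\n\n")

The Snowpark pandas implementation above applies the same functions lazily to the pivoted table, with the extra handling shown in the patch so that margin rows and columns stay consistent after normalization.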
diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index be994015eac..848c5e438b3 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -197,7 +197,10 @@ compute_bin_indices, preprocess_bins_for_cut, ) -from snowflake.snowpark.modin.plugin._internal.frame import InternalFrame +from snowflake.snowpark.modin.plugin._internal.frame import ( + InternalFrame, + LabelIdentifierPair, +) from snowflake.snowpark.modin.plugin._internal.groupby_utils import ( GROUPBY_AGG_PRESERVES_SNOWPARK_PANDAS_TYPE, GROUPBY_AGG_WITH_NONE_SNOWPARK_PANDAS_TYPES, @@ -5698,11 +5701,14 @@ def agg( ) for agg_arg in agg_args } + pandas_labels = list(agg_col_map.keys()) + if self.is_multiindex(axis=1): + pandas_labels = [ + (label,) * len(self.columns.names) for label in pandas_labels + ] single_agg_func_query_compilers.append( SnowflakeQueryCompiler( - frame.project_columns( - list(agg_col_map.keys()), list(agg_col_map.values()) - ) + frame.project_columns(pandas_labels, list(agg_col_map.values())) ) ) else: # axis == 0 @@ -14138,7 +14144,6 @@ def create_lazy_type_functions( assert len(right_result_data_identifiers) == 1, "other must be a Series" right = right_result_data_identifiers[0] right_datatype = right_datatypes[0] - # now replace in result frame identifiers with binary op result replace_mapping = {} snowpark_pandas_types = [] @@ -14160,10 +14165,19 @@ def create_lazy_type_functions( identifiers_to_keep = set( new_frame.index_column_snowflake_quoted_identifiers ) | set(update_result.old_id_to_new_id_mappings.values()) + self_is_column_mi = len(self._modin_frame.data_column_pandas_index_names) label_to_snowflake_quoted_identifier = [] snowflake_quoted_identifier_to_snowpark_pandas_type = {} for pair in new_frame.label_to_snowflake_quoted_identifier: if pair.snowflake_quoted_identifier in identifiers_to_keep: + if ( + self_is_column_mi + and isinstance(pair.label, tuple) + and isinstance(pair.label[0], tuple) + ): + pair = LabelIdentifierPair( + pair.label[0], pair.snowflake_quoted_identifier + ) label_to_snowflake_quoted_identifier.append(pair) snowflake_quoted_identifier_to_snowpark_pandas_type[ pair.snowflake_quoted_identifier @@ -14177,7 +14191,7 @@ def create_lazy_type_functions( label_to_snowflake_quoted_identifier ), num_index_columns=new_frame.num_index_columns, - data_column_index_names=new_frame.data_column_index_names, + data_column_index_names=self._modin_frame.data_column_index_names, snowflake_quoted_identifier_to_snowpark_pandas_type=snowflake_quoted_identifier_to_snowpark_pandas_type, ) @@ -14588,9 +14602,7 @@ def infer_sorted_column_labels( new_frame = InternalFrame.create( ordered_dataframe=expanded_ordered_frame, data_column_pandas_labels=sorted_column_labels, - data_column_pandas_index_names=[ - None - ], # operation removes column index name always. 
+ data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names, data_column_snowflake_quoted_identifiers=frame.data_column_snowflake_quoted_identifiers + new_identifiers, index_column_pandas_labels=index_column_pandas_labels, @@ -14637,7 +14649,7 @@ def infer_sorted_column_labels( new_frame = InternalFrame.create( ordered_dataframe=expanded_ordered_frame, data_column_pandas_labels=expanded_data_column_pandas_labels, - data_column_pandas_index_names=[None], # operation removes names + data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names, data_column_snowflake_quoted_identifiers=expanded_data_column_snowflake_quoted_identifiers, index_column_pandas_labels=index_column_pandas_labels, index_column_snowflake_quoted_identifiers=frame.index_column_snowflake_quoted_identifiers, diff --git a/tests/integ/modin/crosstab/conftest.py b/tests/integ/modin/crosstab/conftest.py new file mode 100644 index 00000000000..6203419321d --- /dev/null +++ b/tests/integ/modin/crosstab/conftest.py @@ -0,0 +1,91 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 + + +@pytest.fixture(scope="function") +def a(): + return np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + + +@pytest.fixture(scope="function") +def b(): + return np.array( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + dtype=object, + ) + + +@pytest.fixture(scope="function") +def c(): + return np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + + +@pytest.fixture(scope="function") +def basic_crosstab_dfs(): + df = native_pd.DataFrame( + { + "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], + "favorite_food": [ + "chicken", + "fish", + "fish", + "beef", + "chicken", + "beef", + "fish", + "beef", + ], + "age": [7, 2, 8, 5, 9, 3, 6, 1], + } + ) + return df, pd.DataFrame(df) diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py new file mode 100644 index 00000000000..276650519d9 --- /dev/null +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -0,0 +1,639 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
+# + +import re + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.utils import eval_snowpark_pandas_result + + +@pytest.mark.parametrize("dropna", [True, False]) +class TestCrosstab: + def test_basic_crosstab_with_numpy_arrays(self, dropna, a, b, c): + query_count = 1 + join_count = 0 if dropna else 1 + with SqlCounter(query_count=query_count, join_count=join_count): + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ), + ) + + def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna, a, b, c): + a = a[:-1] + b = b[:-2] + c = c[:-3] + with SqlCounter(query_count=0): + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ), + assert_exception_equal=True, + expect_exception=True, + expect_exception_match="All arrays must be of the same length", + expect_exception_type=ValueError, + ) + + # In these tests, `overlap` refers to the intersection of the indices + # of the Series objects being passed in to crosstab. crosstab takes + # only the intersection of the index objects of all Series when determining + # the final DataFrame to pass into pivot_table, so here, we are testing + # that we follow that behavior. + def test_basic_crosstab_with_series_objs_full_overlap(self, dropna, a, b, c): + # In this case, all indexes are identical - hence "full" overlap. + query_count = 2 + join_count = 5 if dropna else 10 + + def eval_func(lib): + if lib is pd: + return lib.crosstab( + a, + [lib.Series(b), lib.Series(c)], + rownames=["a"], + colnames=["b", "c"], + dropna=dropna, + ) + else: + return lib.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + + with SqlCounter(query_count=query_count, join_count=join_count): + eval_snowpark_pandas_result(pd, native_pd, eval_func) + + def test_basic_crosstab_with_series_objs_some_overlap(self, dropna, a, b, c): + # In this case, some values are shared across indexes (non-zero intersection), + # hence "some" overlap. + # When a mix of Series and non-Series objects are passed in, the non-Series + # objects are expected to have the same length as the intersection of the indexes + # of the Series objects. This test case passes because we pass in arrays that + # are the length of the intersection rather than the length of each of the Series. + query_count = 2 + join_count = 5 if dropna else 10 + b = native_pd.Series( + b, + index=list(range(len(a))), + ) + c = native_pd.Series( + c, + index=-1 * np.array(list(range(len(a)))), + ) + + # All columns have to be the same length (if NumPy arrays are present, then + # pandas errors if they do not match the length of the other Series after + # they are joined (i.e. filtered so that their indices are the same)). In + # this test, we truncate the numpy column so that the lengths are correct. 
+ def eval_func(args_list): + a, b, c = args_list + if isinstance(b, native_pd.Series): + return native_pd.crosstab( + a[:1], [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + else: + return pd.crosstab( + a[:1], [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + + with SqlCounter(query_count=query_count, join_count=join_count): + native_args = [a, b, c] + snow_args = [a, pd.Series(b), pd.Series(c)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_basic_crosstab_with_series_objs_some_overlap_error(self, dropna, a, b, c): + # Same as above - the intersection of the indexes of the Series objects + # is non-zero, but the indexes are not identical - hence "some" overlap. + # When a mix of Series and non-Series objects are passed in, the non-Series + # objects are expected to have the same length as the intersection of the indexes + # of the Series objects. This test case errors because we pass in arrays that + # are the length of the Series, rather than the length of the intersection of + # the indexes of the Series. + b = native_pd.Series( + b, + index=list(range(len(a))), + ) + c = native_pd.Series( + c, + index=-1 * np.array(list(range(len(a)))), + ) + + # All columns have to be the same length (if NumPy arrays are present, then + # pandas errors if they do not match the length of the other Series after + # they are joined (i.e. filtered so that their indices are the same)) + def eval_func(args_list): + a, b, c = args_list + if isinstance(b, native_pd.Series): + return native_pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + else: + return pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + + native_args = [a, b, c] + snow_args = [a, pd.Series(b), pd.Series(c)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + expect_exception=True, + expect_exception_match=re.escape( + "Length mismatch: Expected 11 rows, received array of length 1" + ), + expect_exception_type=ValueError, + assert_exception_equal=False, # Our error message is a little different. + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_basic_crosstab_with_series_objs_no_overlap_error(self, dropna, a, b, c): + # In this case, no values are shared across the indexes - the intersection is an + # empty set - hence "no" overlap. We error here for the same reason as above - the + # arrays passed in should also be empty, but are non-empty. + b = native_pd.Series( + b, + index=list(range(len(a))), + ) + c = native_pd.Series( + c, + index=-1 - np.array(list(range(len(a)))), + ) + + # All columns have to be the same length (if NumPy arrays are present, then + # pandas errors if they do not match the length of the other Series after + # they are joined (i.e. 
filtered so that their indices are the same)) + def eval_func(args_list): + a, b, c = args_list + if isinstance(b, native_pd.Series): + return native_pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + else: + return pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + + native_args = [a, b, c] + snow_args = [a, pd.Series(b), pd.Series(c)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + expect_exception=True, + expect_exception_match=re.escape( + "Length mismatch: Expected 11 rows, received array of length 0" + ), + expect_exception_type=ValueError, + assert_exception_equal=False, # Our error message is a little different. + ) + + def test_basic_crosstab_with_df_and_series_objs_pandas_errors_columns( + self, dropna, a, b, c + ): + query_count = 4 + join_count = 1 if dropna else 3 + a = native_pd.Series( + a, + dtype=object, + ) + b = native_pd.DataFrame( + { + "0": b, + "1": c, + } + ) + # pandas expects only Series objects, or DataFrames that have only a single column, while + # we support accepting DataFrames with multiple columns. + with pytest.raises( + AssertionError, match="arrays and names must have the same length" + ): + native_pd.crosstab(a, b, rownames=["a"], colnames=["b", "c"], dropna=dropna) + + def eval_func(args_list): + a, b = args_list + if isinstance(a, native_pd.Series): + return native_pd.crosstab( + a, + [b[c] for c in b.columns], + rownames=["a"], + colnames=["b", "c"], + dropna=dropna, + ) + else: + return pd.crosstab( + a, b, rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + + with SqlCounter(query_count=query_count, join_count=join_count): + native_args = [a, b] + snow_args = [pd.Series(a), pd.DataFrame(b)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + ) + + def test_basic_crosstab_with_df_and_series_objs_pandas_errors_index( + self, dropna, a, b, c + ): + query_count = 6 + join_count = 5 if dropna else 17 + a = native_pd.Series( + a, + dtype=object, + ) + b = native_pd.DataFrame( + { + "0": b, + "1": c, + } + ) + # pandas expects only Series objects, or DataFrames that have only a single column, while + # we support accepting DataFrames with multiple columns. 
+ with pytest.raises( + AssertionError, match="arrays and names must have the same length" + ): + native_pd.crosstab(b, a, rownames=["a", "b"], colnames=["c"], dropna=dropna) + + def eval_func(args_list): + a, b = args_list + if isinstance(a, native_pd.Series): + return native_pd.crosstab( + [b[c] for c in b.columns], + a, + rownames=["a", "b"], + colnames=["c"], + dropna=dropna, + ) + else: + return pd.crosstab( + b, a, rownames=["a", "b"], colnames=["c"], dropna=dropna + ) + + with SqlCounter(query_count=query_count, join_count=join_count): + native_args = [a, b] + snow_args = [pd.Series(a), pd.DataFrame(b)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + ) + + def test_margins(self, dropna, a, b, c): + query_count = 1 + join_count = 1 if dropna else 2 + union_count = 1 + + with SqlCounter( + query_count=query_count, join_count=join_count, union_count=union_count + ): + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + margins=True, + margins_name="MARGINS_NAME", + dropna=dropna, + ), + ) + + @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) + def test_normalize(self, dropna, normalize, a, b, c): + query_count = 1 if normalize in (0, "index") else 2 + join_count = 3 if normalize in (0, "index") else 2 + if dropna: + join_count -= 2 + + with SqlCounter(query_count=query_count, join_count=join_count): + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + normalize=normalize, + dropna=dropna, + ), + ) + + @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) + def test_normalize_and_margins(self, dropna, normalize, a, b, c): + counts = { + "columns": [3, 5 if dropna else 9, 4], + "index": [1, 5 if dropna else 8, 3], + "all": [3, 12 if dropna else 19, 7], + } + counts[0] = counts["index"] + counts[1] = counts["columns"] + + if normalize is True: + sql_counts = counts["all"] + else: + sql_counts = counts[normalize] + with SqlCounter( + query_count=sql_counts[0], + join_count=sql_counts[1], + union_count=sql_counts[2], + ): + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + normalize=normalize, + margins=True, + dropna=dropna, + ), + ) + + @pytest.mark.parametrize("normalize", [0, 1, "index", "columns"]) + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) + def test_normalize_margins_and_values(self, dropna, normalize, aggfunc, a, b, c): + counts = { + "columns": [3, 29 if dropna else 41, 4], + "index": [1, 23 if dropna else 32, 3], + "all": [3, 54 if dropna else 75, 7], + } + counts[0] = counts["index"] + counts[1] = counts["columns"] + vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) + if normalize is True: + sql_counts = counts["all"] + else: + sql_counts = counts[normalize] + + def eval_func(lib): + df = lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + values=vals, + normalize=normalize, + margins=True, + dropna=dropna, + aggfunc=aggfunc, + ) + if aggfunc == "sum": + # When normalizing the data, we apply the normalization function to the + # entire table (including margins), which requires us to multiply by 2 + # (since the function takes the sum over the rows, and the margins row is + # itself the sum over the rows, causing the sum over all rows to be equal + # to 2 * the sum over the input rows). 
This hack allows us to save on joins + # but results in slight precision issues. + df = df.round(decimals=6) + return df + + with SqlCounter( + query_count=sql_counts[0], + join_count=sql_counts[1], + union_count=sql_counts[2], + ): + eval_snowpark_pandas_result( + pd, + native_pd, + eval_func, + ) + + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) + def test_margins_and_values(self, dropna, aggfunc, a, b, c): + vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) + + def eval_func(lib): + df = lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + values=vals, + margins=True, + dropna=dropna, + aggfunc=aggfunc, + ) + return df + + with SqlCounter( + query_count=1, + join_count=7 if dropna else 10, + union_count=1, + ): + eval_snowpark_pandas_result( + pd, + native_pd, + eval_func, + ) + + @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) + def test_normalize_and_values(self, dropna, normalize, aggfunc, a, b, c): + counts = { + "columns": [2, 4 if dropna else 10], + "index": [1, 5 if dropna else 11], + "all": [2, 4 if dropna else 10], + } + counts[0] = counts["index"] + counts[1] = counts["columns"] + vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) + if normalize is True: + sql_counts = counts["all"] + else: + sql_counts = counts[normalize] + + def eval_func(lib): + df = lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + values=vals, + normalize=normalize, + dropna=dropna, + aggfunc=aggfunc, + ) + if aggfunc in ["sum", "max"]: + # When normalizing the data, we apply the normalization function to the + # entire table (including margins), which requires us to multiply by 2 + # (since the function takes the sum over the rows, and the margins row is + # itself the sum over the rows, causing the sum over all rows to be equal + # to 2 * the sum over the input rows). This hack allows us to save on joins + # but results in slight precision issues. 
+ df = df.round(decimals=6) + return df + + with SqlCounter( + query_count=sql_counts[0], + join_count=sql_counts[1], + ): + eval_snowpark_pandas_result( + pd, + native_pd, + eval_func, + ) + + @pytest.mark.parametrize("normalize", ["all", True]) + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) + @sql_count_checker(query_count=0) + def test_normalize_margins_and_values_not_supported( + self, dropna, normalize, aggfunc, a, b, c + ): + vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) + with pytest.raises( + NotImplementedError, + match='Snowpark pandas does not yet support passing in margins=True, normalize="all", and values.', + ): + pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + values=vals, + normalize=normalize, + margins=True, + dropna=dropna, + aggfunc=aggfunc, + ) + + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) + def test_values(self, dropna, aggfunc, basic_crosstab_dfs): + query_count = 1 + join_count = 2 if dropna else 5 + native_df = basic_crosstab_dfs[0] + + with SqlCounter(query_count=query_count, join_count=join_count): + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + native_df["species"].values, + native_df["favorite_food"].values, + values=native_df["age"].values, + aggfunc=aggfunc, + dropna=dropna, + ), + ) + + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) + def test_values_series_like(self, dropna, aggfunc, basic_crosstab_dfs): + query_count = 5 + join_count = 2 if dropna else 5 + native_df, snow_df = basic_crosstab_dfs + + def eval_func(df): + if isinstance(df, pd.DataFrame): + return pd.crosstab( + df["species"], + df["favorite_food"], + values=df["age"], + aggfunc=aggfunc, + dropna=dropna, + ) + else: + return native_pd.crosstab( + df["species"], + df["favorite_food"], + values=df["age"], + aggfunc=aggfunc, + dropna=dropna, + ) + + with SqlCounter(query_count=query_count, join_count=join_count): + eval_snowpark_pandas_result( + snow_df, + native_df, + eval_func, + ) + + +@sql_count_checker(query_count=0) +def test_values_unsupported_aggfunc(basic_crosstab_dfs): + native_df = basic_crosstab_dfs[0] + + with pytest.raises( + NotImplementedError, + match="Snowpark pandas DataFrame.pivot_table does not yet support the aggregation 'median' with the given arguments.", + ): + pd.crosstab( + native_df["species"].values, + native_df["favorite_food"].values, + values=native_df["age"].values, + aggfunc="median", + dropna=False, + ) + + +@sql_count_checker(query_count=4) +def test_values_series_like_unsupported_aggfunc(basic_crosstab_dfs): + # The query count above comes from building the DataFrame + # that we pass in to pivot table. 
+ _, snow_df = basic_crosstab_dfs + + with pytest.raises( + NotImplementedError, + match="Snowpark pandas DataFrame.pivot_table does not yet support the aggregation 'median' with the given arguments.", + ): + snow_df = pd.crosstab( + snow_df["species"], + snow_df["favorite_food"], + values=snow_df["age"], + aggfunc="median", + dropna=False, + ) + + +@sql_count_checker(query_count=0) +def test_values_aggfunc_one_supplied_should_error(a, b, c): + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab(index=a, columns=b, aggfunc="sum"), + expect_exception=True, + expect_exception_match="aggfunc cannot be used without values.", + expect_exception_type=ValueError, + assert_exception_equal=True, + ) + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab(index=a, columns=b, values=c), + expect_exception=True, + expect_exception_match="values cannot be used without an aggfunc.", + expect_exception_type=ValueError, + assert_exception_equal=True, + ) + + +@sql_count_checker(query_count=0) +def test_invalid_normalize(a, b): + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab(index=a, columns=b, normalize="invalid_value"), + expect_exception=True, + expect_exception_match="Not a valid normalize argument", + expect_exception_type=ValueError, + assert_exception_equal=True, + ) diff --git a/tests/unit/modin/test_unsupported.py b/tests/unit/modin/test_unsupported.py index 1e72dbd43ca..63a1cbc3bd3 100644 --- a/tests/unit/modin/test_unsupported.py +++ b/tests/unit/modin/test_unsupported.py @@ -45,7 +45,6 @@ def test_unsupported_io(io_method, kwargs): [ ["merge_ordered", {"left": "", "right": ""}], ["value_counts", {"values": ""}], - ["crosstab", {"index": "", "columns": ""}], ["lreshape", {"data": "", "groups": ""}], ["wide_to_long", {"df": "", "stubnames": "", "i": "", "j": ""}], ], From e9ea11a484d080d6b762e4d23a952dbe3f849bc4 Mon Sep 17 00:00:00 2001 From: Afroz Alam Date: Tue, 3 Sep 2024 16:39:53 +0000 Subject: [PATCH 6/7] SNOW-1644950: make use_logical_type option more explicit (#2190) --- CHANGELOG.md | 2 +- src/snowflake/snowpark/session.py | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 005aaa3a8dd..b9f997b842c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ #### Improvements -- Added support for `ln` in `snowflake.snowpark.functions` +- Improved documentation for `Session.write_pandas` by making `use_logical_type` option more explicit. - Added support for specifying the following to `DataFrameWriter.save_as_table`: - `enable_schema_evolution` - `data_retention_time` diff --git a/src/snowflake/snowpark/session.py b/src/snowflake/snowpark/session.py index d8a2997fa33..c6f430cc980 100644 --- a/src/snowflake/snowpark/session.py +++ b/src/snowflake/snowpark/session.py @@ -2393,6 +2393,7 @@ def write_pandas( create_temp_table: bool = False, overwrite: bool = False, table_type: Literal["", "temp", "temporary", "transient"] = "", + use_logical_type: Optional[bool] = None, **kwargs: Dict[str, Any], ) -> Table: """Writes a pandas DataFrame to a table in Snowflake and returns a @@ -2429,6 +2430,11 @@ def write_pandas( table_type: The table type of table to be created. The supported values are: ``temp``, ``temporary``, and ``transient``. An empty string means to create a permanent table. Learn more about table types `here `_. 
+ use_logical_type: Boolean that specifies whether to use Parquet logical types when reading the parquet files + for the uploaded pandas dataframe. With this file format option, Snowflake can interpret Parquet logical + types during data loading. To enable Parquet logical types, set use_logical_type as True. Set to None to + use Snowflakes default. For more information, see: + `file format options: `_. Example:: @@ -2505,12 +2511,13 @@ def write_pandas( + (schema + "." if schema else "") + (table_name) ) - signature = inspect.signature(write_pandas) - if not ("use_logical_type" in signature.parameters): - # do not pass use_logical_type if write_pandas does not support it - use_logical_type_passed = kwargs.pop("use_logical_type", None) - if use_logical_type_passed is not None: + if use_logical_type is not None: + signature = inspect.signature(write_pandas) + use_logical_type_supported = "use_logical_type" in signature.parameters + if use_logical_type_supported: + kwargs["use_logical_type"] = use_logical_type + else: # raise warning to upgrade python connector warnings.warn( "use_logical_type will be ignored because current python " From b68d75f318577c771d741eee1abd761b70e64c1e Mon Sep 17 00:00:00 2001 From: Yun Zou Date: Tue, 3 Sep 2024 15:23:55 -0700 Subject: [PATCH 7/7] [SNOW-1556590] Enable test_sql with new compilation stage (#2207) 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. SNOW-1556590 2. Fill out the following pre-review checklist: - [ ] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. 3. Please describe how your code solves the related issue. The test was failing when sql simplification is off, this is fixed with the new query compilation stage, re-enable the test --- tests/integ/test_cte.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/integ/test_cte.py b/tests/integ/test_cte.py index 87a91deab0e..b148ad6a680 100644 --- a/tests/integ/test_cte.py +++ b/tests/integ/test_cte.py @@ -423,10 +423,6 @@ def test_table(session): assert count_number_of_ctes(df_result.queries["queries"][-1]) == 1 -@pytest.mark.skipif( - "config.getoption('disable_sql_simplifier', default=False)", - reason="TODO SNOW-1556590: Re-enable test_sql in test_cte.py when sql simplifier is disabled once new CTE implementation is completed", -) @pytest.mark.parametrize( "query", [ @@ -435,6 +431,11 @@ def test_table(session): ], ) def test_sql(session, query): + if not session._query_compilation_stage_enabled: + pytest.skip( + "CTE query generation without the new query generation doesn't work correctly" + ) + df = session.sql(query).filter(lit(True)) df_result = df.union_all(df).select("*") check_result(session, df_result, expect_cte_optimized=True)
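(Editorial aside, not part of the patches: a hedged usage sketch for the `use_logical_type` option added to `Session.write_pandas` in the commit above. The connection parameters, table name, and sample frame are placeholders; `auto_create_table` is an existing `write_pandas` option used here only so the example table does not need to pre-exist.)

# Illustrative sketch only; credentials and names below are placeholders.
import pandas as pd
from snowflake.snowpark import Session

connection_parameters = {
    "account": "<account>",
    "user": "<user>",
    "password": "<password>",
}
session = Session.builder.configs(connection_parameters).create()

pdf = pd.DataFrame({"ts": pd.to_datetime(["2024-08-30 13:44:44+00:00"])})
# use_logical_type=True asks Snowflake to interpret Parquet logical types (e.g.
# timezone-aware timestamps) when loading the uploaded parquet files; None keeps
# Snowflake's default, and an older python connector without the option triggers a warning.
table = session.write_pandas(
    pdf, "USE_LOGICAL_TYPE_DEMO", auto_create_table=True, use_logical_type=True
)
print(table.collect())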