From 4b68a2691d4c54132169467996e8e236e4361c79 Mon Sep 17 00:00:00 2001 From: Naren Krishna Date: Tue, 10 Sep 2024 14:35:19 -0700 Subject: [PATCH 1/4] REFACTOR: Reuse `pd.date_range` for resample computation (#2266) This PR refactors Snowpark pandas `resample` implementation to use `pd.date_range` for the resulting index (complete with missing bins) rather than compute the number of rows required in the output and using a generator function. `pd.date_range` uses a generator under-the-hood. Signed-off-by: Naren Krishna --- .../modin/plugin/_internal/resample_utils.py | 44 +++---------------- 1 file changed, 5 insertions(+), 39 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/resample_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/resample_utils.py index 32278d3a83b..de83e0429bf 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/resample_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/resample_utils.py @@ -2,7 +2,6 @@ # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. # -import math from typing import Any, Literal, NoReturn, Optional, Union import pandas as native_pd @@ -18,7 +17,6 @@ dateadd, datediff, lit, - row_number, to_timestamp_ntz, ) from snowflake.snowpark.modin.plugin._internal import join_utils @@ -28,16 +26,8 @@ MatchComparator, join, ) -from snowflake.snowpark.modin.plugin._internal.ordered_dataframe import ( - DataFrameReference, - OrderedDataFrame, -) -from snowflake.snowpark.modin.plugin._internal.utils import ( - generate_snowflake_quoted_identifiers_helper, -) from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage from snowflake.snowpark.types import DateType, TimestampType -from snowflake.snowpark.window import Window RESAMPLE_INDEX_LABEL = "__resample_index__" @@ -478,39 +468,15 @@ def get_expected_resample_bins_frame( 2020-01-07 2020-01-09 """ - slice_width, slice_unit = rule_to_snowflake_width_and_slice_unit(rule) - - index_column_snowflake_quoted_identifiers = ( - generate_snowflake_quoted_identifiers_helper( - pandas_labels=[RESAMPLE_INDEX_LABEL] - ) - ) - - # row_number ensures there are no gaps in the sequence. - all_resample_bins_col = dateadd( - slice_unit, - (row_number().over(Window.order_by(lit(1))) - 1) * slice_width, - to_timestamp_ntz(lit(start_date)), - ).as_(index_column_snowflake_quoted_identifiers[0]) - - rowcount = math.floor( - (native_pd.to_datetime(end_date) - native_pd.to_datetime(start_date)) - / to_offset(rule) - + 1 - ) - - expected_resample_bins_snowpark_frame = pd.session.generator( - all_resample_bins_col, rowcount=rowcount - ) - + expected_resample_bins_snowpark_frame = pd.date_range( + start_date, end_date, freq=rule + )._query_compiler._modin_frame return InternalFrame.create( - ordered_dataframe=OrderedDataFrame( - DataFrameReference(expected_resample_bins_snowpark_frame) - ), + ordered_dataframe=expected_resample_bins_snowpark_frame.ordered_dataframe, data_column_pandas_labels=[], data_column_snowflake_quoted_identifiers=[], index_column_pandas_labels=[RESAMPLE_INDEX_LABEL], - index_column_snowflake_quoted_identifiers=index_column_snowflake_quoted_identifiers, + index_column_snowflake_quoted_identifiers=expected_resample_bins_snowpark_frame.index_column_snowflake_quoted_identifiers, data_column_pandas_index_names=[None], data_column_types=None, index_column_types=None, From 911d9de5c4cd6cde25c248cce699487d9d992a93 Mon Sep 17 00:00:00 2001 From: Jonathan Shi <149419494+sfc-gh-joshi@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:48:01 -0700 Subject: [PATCH 2/4] SNOW-1659478: Cherrypick telemetry fixes (#2263) 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-1659478 2. Fill out the following pre-review checklist: - [ ] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. 3. Please describe how your code solves the related issue. Previous code to de-duplicate telemetry entries was incorrect, and would not apply telemetry to methods defined on `dataframe.py` (currently still vendored) but inherited from upstream `base.py`, such as `DataFrame.copy`. This PR automatically applies telemetry annotations to Series + DataFrame inside `pandas/__init__.py` via the extensions module, meaning the `TelemetryMeta` class, and all annotations in extensions/overrides, should no longer be used. --- .../snowpark/modin/pandas/__init__.py | 59 +++---------- .../snowpark/modin/pandas/dataframe.py | 3 +- .../modin/plugin/_internal/telemetry.py | 4 - .../plugin/extensions/base_extensions.py | 5 -- .../modin/plugin/extensions/base_overrides.py | 9 +- .../plugin/extensions/dataframe_extensions.py | 7 -- .../plugin/extensions/dataframe_overrides.py | 7 -- .../plugin/extensions/series_extensions.py | 7 -- .../plugin/extensions/series_overrides.py | 88 +------------------ tests/integ/modin/test_telemetry.py | 56 +++++++++++- 10 files changed, 68 insertions(+), 177 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/__init__.py b/src/snowflake/snowpark/modin/pandas/__init__.py index 4ea950fe1b4..6960d0eb629 100644 --- a/src/snowflake/snowpark/modin/pandas/__init__.py +++ b/src/snowflake/snowpark/modin/pandas/__init__.py @@ -88,14 +88,13 @@ # TODO: SNOW-851745 make sure add all Snowpark pandas API general functions from modin.pandas import plotting # type: ignore[import] -from modin.pandas.base import BasePandasDataset -from modin.pandas.series import _SERIES_EXTENSIONS_, Series +from modin.pandas.series import Series from snowflake.snowpark.modin.pandas.api.extensions import ( register_dataframe_accessor, register_series_accessor, ) -from snowflake.snowpark.modin.pandas.dataframe import _DATAFRAME_EXTENSIONS_, DataFrame +from snowflake.snowpark.modin.pandas.dataframe import DataFrame from snowflake.snowpark.modin.pandas.general import ( bdate_range, concat, @@ -149,6 +148,7 @@ ) from snowflake.snowpark.modin.plugin._internal.session import SnowpandasSessionHolder from snowflake.snowpark.modin.plugin._internal.telemetry import ( + TELEMETRY_PRIVATE_METHODS, try_add_telemetry_to_attribute, ) from snowflake.snowpark.modin.plugin.utils.frontend_constants import _ATTRS_NO_LOOKUP @@ -166,24 +166,6 @@ read_json, ) -# Record which attributes are defined on an upstream object, and which are defined on a vendored -# object (currently just dataframe.py), and determine when adding telemetry is necessary. -# This must be checked before overrides are applied. -_attrs_defined_on_modin_series = set() -for attr_name, attr_value in Series.__dict__.items(): - base_value = BasePandasDataset.__dict__.get(attr_name, None) - if base_value is None or attr_value != base_value: - _attrs_defined_on_modin_series.add(attr_name) - -_attrs_defined_on_dataframe = ( - set() -) # TODO: SNOW-1063346 revisit when dataframe.py is removed -for attr_name, attr_value in DataFrame.__dict__.items(): - base_value = BasePandasDataset.__dict__.get(attr_name, None) - if base_value is None or attr_value != base_value: - _attrs_defined_on_dataframe.add(attr_name) - - # base overrides occur before subclass overrides in case subclasses override a base method import snowflake.snowpark.modin.plugin.extensions.base_extensions # isort: skip # noqa: E402,F401 import snowflake.snowpark.modin.plugin.extensions.base_overrides # isort: skip # noqa: E402,F401 @@ -203,48 +185,27 @@ modin.pandas.base._ATTRS_NO_LOOKUP.update(_ATTRS_NO_LOOKUP) -# For any method defined on Series/DF, add telemetry to it if it meets all of the following conditions: -# 1. The method was defined directly on an upstream class (_attrs_defined_on_modin_base, _attrs_defined_on_modin_series) -# 1a. (DataFrame only): The method is not overridden by DataFrame (not applicable to Series, since we use the upstream version) -# 2. The method is not overridden by an extensions module -# 3. The method name does not start with an _ -_base_telemetry_added_attrs = set() +# For any method defined on Series/DF, add telemetry to it if it: +# 1. Is defined directly on an upstream class +# 2. The method name does not start with an _, or is in TELEMETRY_PRIVATE_METHODS -_series_ext = _SERIES_EXTENSIONS_.copy() for attr_name in dir(Series): # Since Series is defined in upstream Modin, all of its members were either defined upstream # or overridden by extension. - if attr_name not in _series_ext and not attr_name.startswith("_"): + if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS: register_series_accessor(attr_name)( try_add_telemetry_to_attribute(attr_name, getattr(Series, attr_name)) ) - if attr_name not in _attrs_defined_on_modin_series: - # attribute was defined on BasePandasDataset and inherited, so don't override it again - # for DataFrame - _base_telemetry_added_attrs.add(attr_name) # TODO: SNOW-1063346 # Since we still use the vendored version of DataFrame and the overrides for the top-level # namespace haven't been performed yet, we need to set properties on the vendored version -_dataframe_ext = _DATAFRAME_EXTENSIONS_.copy() for attr_name in dir(DataFrame): - if ( - attr_name not in _attrs_defined_on_dataframe - and attr_name not in _dataframe_ext - and not attr_name.startswith("_") - ): - # If this method was inherited from BasePandasDataset and telemetry was already added via - # Series, register the override but don't re-wrap the method in the telemetry annotation. - # If we don't do this check, we will end up double-reporting telemetry on some methods. - original_attr = getattr(DataFrame, attr_name) - new_attr = ( - original_attr - if attr_name in _base_telemetry_added_attrs - else try_add_telemetry_to_attribute(attr_name, original_attr) + if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS: + register_dataframe_accessor(attr_name)( + try_add_telemetry_to_attribute(attr_name, getattr(DataFrame, attr_name)) ) - register_dataframe_accessor(attr_name)(new_attr) - _base_telemetry_added_attrs.add(attr_name) def __getattr__(name: str) -> Any: diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 27ad1831886..c5757160486 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -91,7 +91,6 @@ replace_external_data_keys_with_empty_pandas_series, replace_external_data_keys_with_query_compiler, ) -from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta from snowflake.snowpark.modin.plugin._internal.utils import is_repr_truncated from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike from snowflake.snowpark.modin.plugin.utils.error_message import ( @@ -138,7 +137,7 @@ ], apilink="pandas.DataFrame", ) -class DataFrame(BasePandasDataset, metaclass=TelemetryMeta): +class DataFrame(BasePandasDataset): _pandas_class = pandas.DataFrame def __init__( diff --git a/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py b/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py index f0853a2f71e..d38584c14de 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py @@ -9,7 +9,6 @@ from enum import Enum, unique from typing import Any, Callable, Optional, TypeVar, Union, cast -import modin.pandas from typing_extensions import ParamSpec import snowflake.snowpark.session @@ -543,8 +542,6 @@ class TelemetryMeta(type): def __new__( cls, name: str, bases: tuple, attrs: dict[str, Any] ) -> Union[ - "modin.pandas.Series", - "snowflake.snowpark.modin.pandas.dataframe.DataFrame", "snowflake.snowpark.modin.pandas.groupby.DataFrameGroupBy", "snowflake.snowpark.modin.pandas.resample.Resampler", "snowflake.snowpark.modin.pandas.window.Window", @@ -558,7 +555,6 @@ def __new__( with ``snowpark_pandas_telemetry_api_usage`` telemetry decorator. Method arguments returned by _get_kwargs_telemetry are collected otherwise set telemetry_args=list(). TelemetryMeta is only set as the metaclass of: - snowflake.snowpark.modin.pandas.dataframe.DataFrame, snowflake.snowpark.modin.pandas.groupby.DataFrameGroupBy, snowflake.snowpark.modin.pandas.resample.Resampler, snowflake.snowpark.modin.pandas.window.Window, diff --git a/src/snowflake/snowpark/modin/plugin/extensions/base_extensions.py b/src/snowflake/snowpark/modin/plugin/extensions/base_extensions.py index 496136d736e..01671744efc 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/base_extensions.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/base_extensions.py @@ -6,15 +6,10 @@ File containing BasePandasDataset APIs defined in Snowpark pandas but not the Modin API layer. """ -from snowflake.snowpark.modin.plugin._internal.telemetry import ( - snowpark_pandas_telemetry_method_decorator, -) - from .base_overrides import register_base_override @register_base_override("__array_function__") -@snowpark_pandas_telemetry_method_decorator def __array_function__(self, func: callable, types: tuple, args: tuple, kwargs: dict): """ Apply the `func` to the `BasePandasDataset`. diff --git a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py index f0cddff8058..e10b3687178 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py @@ -73,10 +73,6 @@ raise_if_native_pandas_objects, validate_and_try_convert_agg_func_arg_func_to_str, ) -from snowflake.snowpark.modin.plugin._internal.telemetry import ( - snowpark_pandas_telemetry_method_decorator, - try_add_telemetry_to_attribute, -) from snowflake.snowpark.modin.plugin._typing import ListLike from snowflake.snowpark.modin.plugin.utils.error_message import ( ErrorMessage, @@ -97,7 +93,6 @@ def register_base_override(method_name: str): """ def decorator(base_method: Any): - base_method = try_add_telemetry_to_attribute(method_name, base_method) parent_method = getattr(BasePandasDataset, method_name, None) if isinstance(parent_method, property): parent_method = parent_method.fget @@ -125,9 +120,7 @@ def decorator(base_method: Any): def register_base_not_implemented(): def decorator(base_method: Any): - func = snowpark_pandas_telemetry_method_decorator( - base_not_implemented()(base_method) - ) + func = base_not_implemented()(base_method) register_series_accessor(base_method.__name__)(func) register_dataframe_accessor(base_method.__name__)(func) return func diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_extensions.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_extensions.py index c98be89caa4..394ed64bdeb 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_extensions.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_extensions.py @@ -16,9 +16,6 @@ from snowflake.snowpark.dataframe import DataFrame as SnowparkDataFrame from snowflake.snowpark.modin import pandas as pd # noqa: F401 from snowflake.snowpark.modin.pandas.api.extensions import register_dataframe_accessor -from snowflake.snowpark.modin.plugin._internal.telemetry import ( - snowpark_pandas_telemetry_method_decorator, -) from snowflake.snowpark.modin.plugin.extensions.utils import add_cache_result_docstring @@ -27,7 +24,6 @@ # pandas DataFrame. # Implementation note: Arguments names and types are kept consistent with pandas.DataFrame.to_sql @register_dataframe_accessor("to_snowflake") -@snowpark_pandas_telemetry_method_decorator def to_snowflake( self, name: Union[str, Iterable[str]], @@ -67,7 +63,6 @@ def to_snowflake( @register_dataframe_accessor("to_snowpark") -@snowpark_pandas_telemetry_method_decorator def to_snowpark( self, index: bool = True, index_label: Optional[IndexLabel] = None ) -> SnowparkDataFrame: @@ -200,7 +195,6 @@ def to_snowpark( @register_dataframe_accessor("to_pandas") -@snowpark_pandas_telemetry_method_decorator def to_pandas( self, *, @@ -244,7 +238,6 @@ def to_pandas( @register_dataframe_accessor("cache_result") @add_cache_result_docstring -@snowpark_pandas_telemetry_method_decorator def cache_result(self, inplace: bool = False) -> Optional[pd.DataFrame]: """ Persists the current Snowpark pandas DataFrame to a temporary table that lasts the duration of the session. diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py index 3f11d544fbe..5ce836061ab 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py @@ -18,9 +18,6 @@ from snowflake.snowpark.modin.plugin._internal.aggregation_utils import ( is_snowflake_agg_func, ) -from snowflake.snowpark.modin.plugin._internal.telemetry import ( - snowpark_pandas_telemetry_method_decorator, -) from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage from snowflake.snowpark.modin.utils import _inherit_docstrings, validate_int_kwarg @@ -28,7 +25,6 @@ @_inherit_docstrings(native_pd.DataFrame.memory_usage, apilink="pandas.DataFrame") @register_dataframe_accessor("memory_usage") -@snowpark_pandas_telemetry_method_decorator def memory_usage(self, index: bool = True, deep: bool = False) -> Any: """ Memory Usage (Dummy Information) @@ -68,7 +64,6 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Any: @register_dataframe_accessor("plot") @property -@snowpark_pandas_telemetry_method_decorator def plot( self, x=None, @@ -115,7 +110,6 @@ def plot( # Upstream modin defines sum differently for series/DF, but we use the same implementation for both. @register_dataframe_accessor("sum") -@snowpark_pandas_telemetry_method_decorator def sum( self, axis: Union[Axis, None] = None, @@ -137,7 +131,6 @@ def sum( @register_dataframe_accessor("transform") -@snowpark_pandas_telemetry_method_decorator def transform( self, func: PythonFuncType, axis: Axis = 0, *args: Any, **kwargs: Any ) -> DataFrame: # noqa: PR01, RT01, D200 diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_extensions.py b/src/snowflake/snowpark/modin/plugin/extensions/series_extensions.py index bc1b310bf50..f7bba4c743a 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_extensions.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_extensions.py @@ -16,14 +16,10 @@ from snowflake.snowpark.dataframe import DataFrame as SnowparkDataFrame from snowflake.snowpark.modin import pandas as pd # noqa: F401 from snowflake.snowpark.modin.pandas.api.extensions import register_series_accessor -from snowflake.snowpark.modin.plugin._internal.telemetry import ( - snowpark_pandas_telemetry_method_decorator, -) from snowflake.snowpark.modin.plugin.extensions.utils import add_cache_result_docstring @register_series_accessor("to_snowflake") -@snowpark_pandas_telemetry_method_decorator def to_snowflake( self, name: Union[str, Iterable[str]], @@ -62,7 +58,6 @@ def to_snowflake( @register_series_accessor("to_snowpark") -@snowpark_pandas_telemetry_method_decorator def to_snowpark( self, index: bool = True, index_label: Optional[IndexLabel] = None ) -> SnowparkDataFrame: @@ -172,7 +167,6 @@ def to_snowpark( @register_series_accessor("to_pandas") -@snowpark_pandas_telemetry_method_decorator def to_pandas( self, *, @@ -207,7 +201,6 @@ def to_pandas( @register_series_accessor("cache_result") @add_cache_result_docstring -@snowpark_pandas_telemetry_method_decorator def cache_result(self, inplace: bool = False) -> Optional[pd.Series]: """ Persists the Snowpark pandas Series to a temporary table for the duration of the session. diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index 645109120c5..5011defa685 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -55,10 +55,6 @@ is_scalar, try_convert_index_to_native, ) -from snowflake.snowpark.modin.plugin._internal.telemetry import ( - snowpark_pandas_telemetry_method_decorator, - try_add_telemetry_to_attribute, -) from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike from snowflake.snowpark.modin.plugin.utils.error_message import ( ErrorMessage, @@ -83,9 +79,7 @@ def register_series_not_implemented(): def decorator(base_method: Any): - func = snowpark_pandas_telemetry_method_decorator( - series_not_implemented()(base_method) - ) + func = series_not_implemented()(base_method) register_series_accessor(base_method.__name__)(func) return func @@ -429,7 +423,6 @@ def __init__( # usage isn't meaningful and is set to always return 0. @_inherit_docstrings(native_pd.Series.memory_usage, apilink="pandas.Series") @register_series_accessor("memory_usage") -@snowpark_pandas_telemetry_method_decorator def memory_usage(self, index: bool = True, deep: bool = False) -> int: """ Return zero bytes for memory_usage @@ -441,7 +434,6 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> int: # Snowpark pandas has slightly different type validation from upstream modin. @_inherit_docstrings(native_pd.Series.isin, apilink="pandas.Series") @register_series_accessor("isin") -@snowpark_pandas_telemetry_method_decorator def isin(self, values: set | ListLike) -> Series: """ Whether elements in Series are contained in `values`. @@ -529,7 +521,6 @@ def isin(self, values: set | ListLike) -> Series: # Snowpark pandas raises a warning before materializing data and passing to `plot`. @register_series_accessor("plot") @property -@snowpark_pandas_telemetry_method_decorator def plot( self, kind="line", @@ -570,7 +561,6 @@ def plot( # Upstream Modin has a bug binary operators (except add/radd, ) don't respect fill_value: # https://github.com/modin-project/modin/issues/7381 @register_series_accessor("sub") -@snowpark_pandas_telemetry_method_decorator def sub(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, D200 """ Return subtraction of Series and `other`, element-wise (binary operator `sub`). @@ -583,7 +573,6 @@ def sub(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, @register_series_accessor("rsub") -@snowpark_pandas_telemetry_method_decorator def rsub(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, D200 """ Return subtraction of series and `other`, element-wise (binary operator `rsub`). @@ -595,7 +584,6 @@ def rsub(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, @register_series_accessor("mul") -@snowpark_pandas_telemetry_method_decorator def mul(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, D200 """ Return multiplication of series and `other`, element-wise (binary operator `mul`). @@ -608,7 +596,6 @@ def mul(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, @register_series_accessor("rmul") -@snowpark_pandas_telemetry_method_decorator def rmul(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, D200 """ Return multiplication of series and `other`, element-wise (binary operator `mul`). @@ -620,7 +607,6 @@ def rmul(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, @register_series_accessor("truediv") -@snowpark_pandas_telemetry_method_decorator def truediv(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, D200 """ Return floating division of series and `other`, element-wise (binary operator `truediv`). @@ -636,7 +622,6 @@ def truediv(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT @register_series_accessor("rtruediv") -@snowpark_pandas_telemetry_method_decorator def rtruediv( self, other, level=None, fill_value=None, axis=0 ): # noqa: PR01, RT01, D200 @@ -653,7 +638,6 @@ def rtruediv( @register_series_accessor("floordiv") -@snowpark_pandas_telemetry_method_decorator def floordiv( self, other, level=None, fill_value=None, axis=0 ): # noqa: PR01, RT01, D200 @@ -667,7 +651,6 @@ def floordiv( @register_series_accessor("rfloordiv") -@snowpark_pandas_telemetry_method_decorator def rfloordiv( self, other, level=None, fill_value=None, axis=0 ): # noqa: PR01, RT01, D200 @@ -681,7 +664,6 @@ def rfloordiv( @register_series_accessor("mod") -@snowpark_pandas_telemetry_method_decorator def mod(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, D200 """ Return Modulo of series and `other`, element-wise (binary operator `mod`). @@ -691,7 +673,6 @@ def mod(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, @register_series_accessor("rmod") -@snowpark_pandas_telemetry_method_decorator def rmod(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, D200 """ Return modulo of series and `other`, element-wise (binary operator `rmod`). @@ -703,7 +684,6 @@ def rmod(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, @register_series_accessor("pow") -@snowpark_pandas_telemetry_method_decorator def pow(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, D200 """ Return exponential power of series and `other`, element-wise (binary operator `pow`). @@ -713,7 +693,6 @@ def pow(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, @register_series_accessor("rpow") -@snowpark_pandas_telemetry_method_decorator def rpow(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, D200 """ Return exponential power of series and `other`, element-wise (binary operator `rpow`). @@ -726,7 +705,6 @@ def rpow(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__add__") -@snowpark_pandas_telemetry_method_decorator def __add__(self, right): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.add(right) @@ -734,7 +712,6 @@ def __add__(self, right): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__radd__") -@snowpark_pandas_telemetry_method_decorator def __radd__(self, left): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.radd(left) @@ -742,7 +719,6 @@ def __radd__(self, left): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__and__") -@snowpark_pandas_telemetry_method_decorator def __and__(self, other): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return super(Series, self).__and__(other) @@ -750,7 +726,6 @@ def __and__(self, other): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__rand__") -@snowpark_pandas_telemetry_method_decorator def __rand__(self, other): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return super(Series, self).__rand__(other) @@ -758,7 +733,6 @@ def __rand__(self, other): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__divmod__") -@snowpark_pandas_telemetry_method_decorator def __divmod__(self, right): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.divmod(right) @@ -766,7 +740,6 @@ def __divmod__(self, right): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__rdivmod__") -@snowpark_pandas_telemetry_method_decorator def __rdivmod__(self, left): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.rdivmod(left) @@ -774,7 +747,6 @@ def __rdivmod__(self, left): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__floordiv__") -@snowpark_pandas_telemetry_method_decorator def __floordiv__(self, right): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.floordiv(right) @@ -782,7 +754,6 @@ def __floordiv__(self, right): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__rfloordiv__") -@snowpark_pandas_telemetry_method_decorator def __rfloordiv__(self, right): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.rfloordiv(right) @@ -790,7 +761,6 @@ def __rfloordiv__(self, right): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__mod__") -@snowpark_pandas_telemetry_method_decorator def __mod__(self, right): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.mod(right) @@ -798,7 +768,6 @@ def __mod__(self, right): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__rmod__") -@snowpark_pandas_telemetry_method_decorator def __rmod__(self, left): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.rmod(left) @@ -806,7 +775,6 @@ def __rmod__(self, left): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__mul__") -@snowpark_pandas_telemetry_method_decorator def __mul__(self, right): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.mul(right) @@ -814,7 +782,6 @@ def __mul__(self, right): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__rmul__") -@snowpark_pandas_telemetry_method_decorator def __rmul__(self, left): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.rmul(left) @@ -822,7 +789,6 @@ def __rmul__(self, left): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__or__") -@snowpark_pandas_telemetry_method_decorator def __or__(self, other): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return super(Series, self).__or__(other) @@ -830,7 +796,6 @@ def __or__(self, other): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__ror__") -@snowpark_pandas_telemetry_method_decorator def __ror__(self, other): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return super(Series, self).__ror__(other) @@ -838,7 +803,6 @@ def __ror__(self, other): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__xor__") -@snowpark_pandas_telemetry_method_decorator def __xor__(self, other): # pragma: no cover # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return super(Series, self).__xor__(other) @@ -846,7 +810,6 @@ def __xor__(self, other): # pragma: no cover # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__rxor__") -@snowpark_pandas_telemetry_method_decorator def __rxor__(self, other): # pragma: no cover # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return super(Series, self).__rxor__(other) @@ -854,7 +817,6 @@ def __rxor__(self, other): # pragma: no cover # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__pow__") -@snowpark_pandas_telemetry_method_decorator def __pow__(self, right): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.pow(right) @@ -862,7 +824,6 @@ def __pow__(self, right): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__rpow__") -@snowpark_pandas_telemetry_method_decorator def __rpow__(self, left): # TODO: SNOW-1063347: Modin upgrade - modin.pandas.Series functions return self.rpow(left) @@ -870,28 +831,24 @@ def __rpow__(self, left): # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__sub__") -@snowpark_pandas_telemetry_method_decorator def __sub__(self, right): return self.sub(right) # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__rsub__") -@snowpark_pandas_telemetry_method_decorator def __rsub__(self, left): return self.rsub(left) # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__truediv__") -@snowpark_pandas_telemetry_method_decorator def __truediv__(self, right): return self.truediv(right) # Modin defaults to pandas for binary operators against native pandas/list objects. @register_series_accessor("__rtruediv__") -@snowpark_pandas_telemetry_method_decorator def __rtruediv__(self, left): return self.rtruediv(left) @@ -906,7 +863,6 @@ def __rtruediv__(self, left): # Upstream Modin does validation on func that Snowpark pandas does not. @register_series_accessor("aggregate") -@snowpark_pandas_telemetry_method_decorator def aggregate( self, func: AggFuncType = None, axis: Axis = 0, *args: Any, **kwargs: Any ): @@ -919,7 +875,6 @@ def aggregate( # Upstream Modin does a significant amount of frontend manipulation and may default to pandas. @register_series_accessor("apply") -@snowpark_pandas_telemetry_method_decorator def apply( self, func: AggFuncType, @@ -947,7 +902,6 @@ def apply( # Upstream modin calls to_pandas on the series. @register_series_accessor("map") -@snowpark_pandas_telemetry_method_decorator def map( self, arg: Callable | Mapping | Series, @@ -962,7 +916,6 @@ def map( # Snowpark pandas does different validation than upstream Modin. @register_series_accessor("argmax") -@snowpark_pandas_telemetry_method_decorator def argmax(self, axis=None, skipna=True, *args, **kwargs): # noqa: PR01, RT01, D200 """ Return int position of the largest value in the Series. @@ -983,7 +936,6 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): # noqa: PR01, RT01, # Snowpark pandas does different validation than upstream Modin. @register_series_accessor("argmin") -@snowpark_pandas_telemetry_method_decorator def argmin(self, axis=None, skipna=True, *args, **kwargs): # noqa: PR01, RT01, D200 """ Return int position of the smallest value in the Series. @@ -1005,7 +957,6 @@ def argmin(self, axis=None, skipna=True, *args, **kwargs): # noqa: PR01, RT01, # Modin uses the same implementation as Snowpark pandas starting form 0.31.0. # Until then, upstream Modin does not convert arguments in the caselist into query compilers. @register_series_accessor("case_when") -@snowpark_pandas_telemetry_method_decorator def case_when(self, caselist) -> Series: # noqa: PR01, RT01, D200 """ Replace values where the conditions are True. @@ -1026,7 +977,6 @@ def case_when(self, caselist) -> Series: # noqa: PR01, RT01, D200 # Upstream Modin has a bug: # https://github.com/modin-project/modin/issues/7334 @register_series_accessor("compare") -@snowpark_pandas_telemetry_method_decorator def compare( self, other: Series, @@ -1073,7 +1023,6 @@ def compare( # Snowpark pandas does not respect `ignore_index`, and upstream Modin does not respect `how`. @register_series_accessor("dropna") -@snowpark_pandas_telemetry_method_decorator def dropna( self, *, @@ -1091,7 +1040,6 @@ def dropna( # Upstream Modin does not preserve the series name. # https://github.com/modin-project/modin/issues/7375 @register_series_accessor("duplicated") -@snowpark_pandas_telemetry_method_decorator def duplicated(self, keep: DropKeep = "first"): """ Indicate duplicate Series values. @@ -1109,7 +1057,6 @@ def duplicated(self, keep: DropKeep = "first"): # Modin has a separate definition in both series.py and base.py. In general, we cannot force base_overrides # to override both methods in case the series.py version calls the superclass method. @register_series_accessor("sum") -@snowpark_pandas_telemetry_method_decorator def sum( self, axis: Axis | None = None, @@ -1132,7 +1079,6 @@ def sum( # Snowpark pandas handles kwargs differently than modin. @register_series_accessor("std") -@snowpark_pandas_telemetry_method_decorator def std( self, axis: Axis | None = None, @@ -1157,7 +1103,6 @@ def std( # Snowpark pandas handles kwargs differently than modin. @register_series_accessor("var") -@snowpark_pandas_telemetry_method_decorator def var( self, axis: Axis | None = None, @@ -1233,7 +1178,6 @@ def as_unordered(self, inplace=False): @register_series_accessor("cat") @property -@snowpark_pandas_telemetry_method_decorator def cat(self) -> CategoryMethods: return CategoryMethods(self) @@ -1241,7 +1185,6 @@ def cat(self) -> CategoryMethods: # Snowpark pandas performs type validation that Modin does not. @register_series_accessor("dt") @property -@snowpark_pandas_telemetry_method_decorator def dt(self): # noqa: RT01, D200 """ Accessor object for datetimelike properties of the Series values. @@ -1262,7 +1205,6 @@ def dt(self): # noqa: RT01, D200 # Avoid naming the object "str" to avoid overwriting Python built-in "str". @register_series_accessor("str") @property -@snowpark_pandas_telemetry_method_decorator def _str(self): # noqa: RT01, D200 """ Vectorized string functions for Series and Index. @@ -1297,25 +1239,18 @@ def _set_name(self, name): self._update_inplace(new_query_compiler=self._query_compiler.set_columns(columns)) -register_series_accessor("name")( - try_add_telemetry_to_attribute( - "name", - property(Series._get_name, _set_name), - ) -) +register_series_accessor("name")(property(Series._get_name, _set_name)) # Modin uses len(self.index) instead of len(self), which may incur an extra query. @register_series_accessor("empty") @property -@snowpark_pandas_telemetry_method_decorator def empty(self) -> bool: return len(self) == 0 # Upstream modin uses squeeze_self instead of self_is_series. @register_series_accessor("fillna") -@snowpark_pandas_telemetry_method_decorator def fillna( self, value: Hashable | Mapping | Series = None, @@ -1345,7 +1280,6 @@ def fillna( # Snowpark pandas defines a custom GroupBy object @register_series_accessor("groupby") -@snowpark_pandas_telemetry_method_decorator def groupby( self, by=None, @@ -1389,7 +1323,6 @@ def groupby( # Snowpark pandas should avoid defaulting to pandas (the current Modin upstream version needs # Index._is_memory_usage_qualified to be implemented). @register_series_accessor("info") -@snowpark_pandas_telemetry_method_decorator def info( self, verbose: bool | None = None, @@ -1429,7 +1362,6 @@ def _qcut( # Snowpark pandas ignores `inplace` (possibly an error?) and performs additional validation. @register_series_accessor("replace") -@snowpark_pandas_telemetry_method_decorator def replace( self, to_replace=None, @@ -1463,7 +1395,6 @@ def replace( # Upstream Modin reset_index produces an extra query and performs a relative import of DataFrame. @register_series_accessor("reset_index") -@snowpark_pandas_telemetry_method_decorator def reset_index( self, level=None, @@ -1501,7 +1432,6 @@ def reset_index( # Snowpark pandas performs additional type validation. @register_series_accessor("set_axis") -@snowpark_pandas_telemetry_method_decorator def set_axis( self, labels: IndexLabel, @@ -1525,7 +1455,6 @@ def set_axis( # Modin does a relative import (from .dataframe import DataFrame). We should revisit this once # our vendored copy of DataFrame is removed. @register_series_accessor("rename") -@snowpark_pandas_telemetry_method_decorator def rename( self, index: Renamer | Hashable | None = None, @@ -1578,7 +1507,6 @@ def rename( # Modin does a relative import (from .dataframe import DataFrame). We should revisit this once # our vendored copy of DataFrame is removed. @register_series_accessor("sort_values") -@snowpark_pandas_telemetry_method_decorator def sort_values( self, axis: Axis = 0, @@ -1627,7 +1555,6 @@ def sort_values( # our vendored copy of DataFrame is removed. # Modin also defaults to pandas for some arguments for unstack @register_series_accessor("unstack") -@snowpark_pandas_telemetry_method_decorator def unstack( self, level: int | str | list = -1, @@ -1657,7 +1584,6 @@ def unstack( # Upstream Modin defaults at the frontend layer. @register_series_accessor("where") -@snowpark_pandas_telemetry_method_decorator def where( self, cond: DataFrame | Series | Callable | AnyArrayLike, @@ -1681,7 +1607,6 @@ def where( # Upstream modin defaults to pandas for some arguments. @register_series_accessor("value_counts") -@snowpark_pandas_telemetry_method_decorator def value_counts( self, normalize: bool = False, @@ -1711,7 +1636,6 @@ def value_counts( # https://github.com/pandas-dev/pandas/issues/54806 # The parameter is ignored in upstream modin, but Snowpark pandas wants to error if the parameter is given. @register_series_accessor("shift") -@snowpark_pandas_telemetry_method_decorator def shift( self, periods: int | Sequence[int] = 1, @@ -1733,7 +1657,6 @@ def shift( # Snowpark pandas uses len(self) instead of len(index), saving a query in some cases. @register_series_accessor("squeeze") -@snowpark_pandas_telemetry_method_decorator def squeeze(self, axis: Axis | None = None): """ Squeeze 1 dimensional axis objects into scalars. @@ -1784,7 +1707,6 @@ def _to_datetime(self, **kwargs): # Modin uses the query compiler to_list method, which we should try to implement instead of calling self.values. @register_series_accessor("to_list") -@snowpark_pandas_telemetry_method_decorator def to_list(self) -> list: """ Return a list of the values. @@ -1797,7 +1719,6 @@ def to_list(self) -> list: @register_series_accessor("to_dict") -@snowpark_pandas_telemetry_method_decorator def to_dict(self, into: type[dict] = dict) -> dict: """ Convert Series to {label -> value} dict or dict-like object. @@ -1846,7 +1767,6 @@ def _create_or_update_from_compiler(self, new_query_compiler, inplace=False): # Modin does a relative import (from .dataframe import DataFrame), so until we stop using the vendored # version of DataFrame, we must keep this override. @register_series_accessor("to_frame") -@snowpark_pandas_telemetry_method_decorator def to_frame(self, name: Hashable = no_default) -> DataFrame: # noqa: PR01, RT01, D200 """ Convert Series to {label -> value} dict or dict-like object. @@ -1865,7 +1785,6 @@ def to_frame(self, name: Hashable = no_default) -> DataFrame: # noqa: PR01, RT0 @register_series_accessor("to_numpy") -@snowpark_pandas_telemetry_method_decorator def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -1891,7 +1810,6 @@ def to_numpy( # Snowpark pandas has the extra `statement_params` argument. @register_series_accessor("_to_pandas") -@snowpark_pandas_telemetry_method_decorator def _to_pandas( self, *, @@ -1922,7 +1840,6 @@ def _to_pandas( # Snowpark pandas does more validation and error checking than upstream Modin. @register_series_accessor("__setitem__") -@snowpark_pandas_telemetry_method_decorator def __setitem__(self, key, value): """ Set `value` identified by `key` in the Series. @@ -2070,7 +1987,6 @@ def __setitem__(self, key, value): # Snowpark pandas uses the query compiler build_repr_df method to minimize queries, while upstream # modin calls BasePandasDataset.build_repr_df @register_series_accessor("__repr__") -@snowpark_pandas_telemetry_method_decorator def __repr__(self): """ Return a string representation for a particular Series. diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index 80317357af9..ce9e1caf328 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -562,15 +562,67 @@ def test_telemetry_copy(): {"name": "Series.property.name_set"}, {"name": "Series.BasePandasDataset.copy"}, ] + # DataFrame is currently still vendored, and inherits copy from BasePandasDataset + df = pd.DataFrame([1]) + copied_df = df.copy() + assert df._query_compiler.snowpark_pandas_api_calls == [] + assert copied_df._query_compiler.snowpark_pandas_api_calls == [ + {"name": "DataFrame.BasePandasDataset.copy"} + ] @sql_count_checker(query_count=0) def test_telemetry_series_describe(): - # describe() is defined in upstream Modin's Series class, and not overridden by Snowpark pandas. + # describe() is defined in upstream Modin's Series class, and calls super().describe(). + # Snowpark pandas overrides the BasePandasDataset superclass implementation, but telemetry on it + # is not recorded because we only add telemetry to the implementation of the child class. s = pd.Series([1, 2, 3, 4]) result = s.describe() assert result._query_compiler.snowpark_pandas_api_calls == [ {"name": "Series.property.name_set"}, - {"name": "Series.describe"}, {"name": "Series.Series.describe"}, ] + + +@sql_count_checker(query_count=0) +def test_telemetry_series_isin(): + # isin is overridden in both series_overrides.py and base_overrides.py + # This test ensures we only report telemetry for one + s = pd.Series([1, 2, 3, 4]) + result = s.isin([1]) + assert result._query_compiler.snowpark_pandas_api_calls == [ + {"name": "Series.property.name_set"}, + {"name": "Series.isin"}, + ] + + +@sql_count_checker(query_count=0) +def test_telemetry_quantile(): + # quantile is overridden in base_overrides.py + s = pd.Series([1, 2, 3, 4]) + result_s = s.quantile(q=[0.1, 0.2]) + assert result_s._query_compiler.snowpark_pandas_api_calls == [ + {"name": "Series.property.name_set"}, + {"argument": ["q"], "name": "Series.Series.quantile"}, + ] + df = pd.DataFrame([1, 2, 3, 4]) + result_df = df.quantile(q=[0.1, 0.2]) + assert result_df._query_compiler.snowpark_pandas_api_calls == [ + {"argument": ["q"], "name": "DataFrame.DataFrame.quantile"}, + ] + + +@sql_count_checker(query_count=2) +def test_telemetry_cache_result(): + # cache_result exists only in Snowpark pandas + s = pd.Series([1, 2, 3, 4]) + result_s = s.cache_result() + assert result_s._query_compiler.snowpark_pandas_api_calls == [ + {"name": "Series.property.name_set"}, + {"name": "Series.cache_result"}, + ] + df = pd.DataFrame([1, 2, 3, 4]) + result_df = df.cache_result() + assert result_df._query_compiler.snowpark_pandas_api_calls == [ + {"name": "DataFrame.cache_result"}, + ] From d135045eaf8ca15ff071ac8f40892d5b1314a934 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Tue, 10 Sep 2024 15:25:55 -0700 Subject: [PATCH 3/4] SNOW-1658145 Fix the supported documentation pages for Index (#2259) Fixes [SNOW-1658145](https://snowflakecomputing.atlassian.net/browse/SNOW-1658145) I also verified that TimedeltaIndex and DatetimeIndex APIs have correct supported documentation. [SNOW-1658145]: https://snowflakecomputing.atlassian.net/browse/SNOW-1658145?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ --- docs/source/modin/indexing.rst | 43 +------- .../modin/supported/index_supported.rst | 4 +- .../snowpark/modin/pandas/dataframe.py | 2 +- .../modin/plugin/extensions/base_overrides.py | 7 +- .../modin/plugin/extensions/datetime_index.py | 6 + .../snowpark/modin/plugin/extensions/index.py | 103 ++---------------- tests/integ/modin/frame/test_dropna.py | 16 ++- tests/integ/modin/index/test_index_methods.py | 61 +++++++---- tests/integ/modin/test_unimplemented.py | 7 ++ 9 files changed, 76 insertions(+), 173 deletions(-) diff --git a/docs/source/modin/indexing.rst b/docs/source/modin/indexing.rst index 80ceba61bec..3ba712cb53b 100644 --- a/docs/source/modin/indexing.rst +++ b/docs/source/modin/indexing.rst @@ -24,7 +24,6 @@ Index Index.is_monotonic_decreasing Index.is_unique Index.has_duplicates - Index.hasnans Index.dtype Index.shape Index.name @@ -34,6 +33,8 @@ Index Index.empty Index.T Index.nlevels + Index.array + Index.str .. rubric:: Snowflake Specific @@ -52,17 +53,11 @@ Index Index.argmin Index.argmax Index.copy - Index.delete - Index.drop - Index.drop_duplicates - Index.duplicated Index.equals Index.identical - Index.insert Index.is_boolean Index.is_floating Index.is_integer - Index.is_interval Index.is_numeric Index.is_object Index.item @@ -73,8 +68,6 @@ Index Index.unique Index.nunique Index.value_counts - Index.array - Index.str .. rubric:: Compatibility with MultiIndex @@ -83,17 +76,6 @@ Index Index.set_names -.. rubric:: Missing values - -.. autosummary:: - :toctree: pandas_api/ - - Index.fillna - Index.dropna - Index.isna - Index.notna - - .. rubric:: Conversion .. autosummary:: @@ -113,27 +95,6 @@ Index Index.sort_values -.. rubric:: Combining / joining / set operations - -.. autosummary:: - :toctree: pandas_api/ - - Index.append - Index.join - Index.intersection - Index.union - Index.difference - -.. rubric:: Selecting - -.. autosummary:: - :toctree: pandas_api/ - - Index.get_indexer_for - Index.get_level_values - Index.isin - Index.slice_indexer - .. _api.datetimeindex: DatetimeIndex diff --git a/docs/source/modin/supported/index_supported.rst b/docs/source/modin/supported/index_supported.rst index 0c413c201fb..f33e741de5b 100644 --- a/docs/source/modin/supported/index_supported.rst +++ b/docs/source/modin/supported/index_supported.rst @@ -54,9 +54,9 @@ Attributes +-----------------------------+---------------------------------+----------------------------------------------------+ | ``nlevels`` | P | Only single Index supported. | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``array`` | N | | +| ``array`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``str`` | D | | +| ``str`` | Y | | +-----------------------------+---------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index c5757160486..83893e83e9c 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -2362,7 +2362,7 @@ def set_index( # this needs to pull all index which is inefficient if verify_integrity and not new_query_compiler.index.is_unique: duplicates = new_query_compiler.index[ - new_query_compiler.index.duplicated() + new_query_compiler.index.to_pandas().duplicated() ].unique() raise ValueError(f"Index has duplicate keys: {duplicates}") diff --git a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py index e10b3687178..aeca9d6e305 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py @@ -853,12 +853,7 @@ def _dropna( if how not in ["any", "all"]: raise ValueError("invalid how option: %s" % how) if subset is not None: - if axis == 1: - indices = self.index.get_indexer_for(subset) - check = indices == -1 - if check.any(): - raise KeyError(list(np.compress(check, subset))) - else: + if axis != 1: indices = self.columns.get_indexer_for(subset) check = indices == -1 if check.any(): diff --git a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py index 2b7b4011770..7be7adb54c1 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py @@ -1133,6 +1133,7 @@ def round( frequency like 'S' (second) not 'ME' (month end). See frequency aliases for a list of possible `freq` values. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + This parameter is only supported for 'raise'. Only relevant for DatetimeIndex: - 'infer' will attempt to infer fall dst-transition hours based on @@ -1145,6 +1146,7 @@ def round( times. nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' + This parameter is only supported for 'raise'. A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1199,6 +1201,7 @@ def floor( frequency like 'S' (second) not 'ME' (month end). See frequency aliases for a list of possible `freq` values. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + This parameter is only supported for 'raise'. Only relevant for DatetimeIndex: - 'infer' will attempt to infer fall dst-transition hours based on @@ -1211,6 +1214,7 @@ def floor( times. nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' + This parameter is only supported for 'raise'. A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. @@ -1265,6 +1269,7 @@ def ceil( frequency like 'S' (second) not 'ME' (month end). See frequency aliases for a list of possible `freq` values. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + This parameter is only supported for 'raise'. Only relevant for DatetimeIndex: - 'infer' will attempt to infer fall dst-transition hours based on @@ -1277,6 +1282,7 @@ def ceil( times. nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' + This parameter is only supported for 'raise'. A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index 6813f277f69..12710224de7 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -1101,6 +1101,7 @@ def delete(self) -> None: """ # TODO: SNOW-1458146 implement delete + @index_not_implemented() def drop( self, labels: Any, @@ -1124,16 +1125,8 @@ def drop( ------ KeyError If all labels are not found in the selected axis - - Examples - -------- - >>> idx = pd.Index(['a', 'b', 'c']) - >>> idx.drop(['a']) - Index(['b', 'c'], dtype='object') """ # TODO: SNOW-1458146 implement drop - WarningMessage.index_to_pandas_warning("drop") - return self.__constructor__(self.to_pandas().drop(labels=labels, errors=errors)) @index_not_implemented() def drop_duplicates(self) -> None: @@ -1159,6 +1152,7 @@ def drop_duplicates(self) -> None: """ # TODO: SNOW-1458147 implement drop_duplicates + @index_not_implemented() def duplicated(self, keep: Literal["first", "last", False] = "first") -> np.ndarray: """ Indicate duplicate index values. @@ -1187,35 +1181,8 @@ def duplicated(self, keep: Literal["first", "last", False] = "first") -> np.ndar -------- Series.duplicated : Equivalent method on pandas.Series. DataFrame.duplicated : Equivalent method on pandas.DataFrame. - - Examples - -------- - By default, for each set of duplicated values, the first occurrence is - set to False and all others to True: - - >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) - >>> idx.duplicated() - array([False, False, True, False, True]) - - which is equivalent to - - >>> idx.duplicated(keep='first') - array([False, False, True, False, True]) - - By using 'last', the last occurrence of each set of duplicated values - is set on False and all others on True: - - >>> idx.duplicated(keep='last') - array([ True, False, True, False, False]) - - By setting keep on ``False``, all duplicates are True: - - >>> idx.duplicated(keep=False) - array([ True, False, True, False, True]) """ # TODO: SNOW-1458147 implement duplicated - WarningMessage.index_to_pandas_warning("duplicated") - return self.to_pandas().duplicated(keep=keep) def equals(self, other: Any) -> bool: """ @@ -2246,6 +2213,7 @@ def sort_values( builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect an ``Index`` and return an ``Index`` of the same shape. + This parameter is not yet supported. Returns ------- @@ -2380,6 +2348,7 @@ def intersection(self, other: Any, sort: bool = False) -> Index: ) ) + @index_not_implemented() def union(self, other: Any, sort: bool = False) -> Index: """ Form the union of two Index objects. @@ -2407,30 +2376,11 @@ def union(self, other: Any, sort: bool = False) -> Index: ------- Index The Index that represents the union between the two indexes - - Examples - -------- - Union matching dtypes - - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.union(idx2) - Index([1, 2, 3, 4, 5, 6], dtype='int64') - - Union mismatched dtypes - - >>> idx1 = pd.Index(['a', 'b', 'c', 'd']) - >>> idx2 = pd.Index([1, 2, 3, 4]) - >>> idx1.union(idx2) - Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object') """ # TODO: SNOW-1458149 implement union w/o sort # TODO: SNOW-1468240 implement union w/ sort - WarningMessage.index_to_pandas_warning("union") - return self.__constructor__( - self.to_pandas().union(other=try_convert_index_to_native(other), sort=sort) - ) + @index_not_implemented() def difference(self, other: Any, sort: Any = None) -> Index: """ Return a new Index with elements of index not in `other`. @@ -2454,22 +2404,10 @@ def difference(self, other: Any, sort: Any = None) -> Index: ------- Index An index object that represents the difference between the two indexes. - - Examples - -------- - >>> idx1 = pd.Index([2, 1, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.difference(idx2) - Index([1, 2], dtype='int64') - >>> idx1.difference(idx2, sort=False) - Index([2, 1], dtype='int64') """ # TODO: SNOW-1458152 implement difference - WarningMessage.index_to_pandas_warning("difference") - return self.__constructor__( - self.to_pandas().difference(try_convert_index_to_native(other), sort=sort) - ) + @index_not_implemented() def get_indexer_for(self, target: Any) -> Any: """ Guaranteed return of an indexer even when non-unique. @@ -2481,13 +2419,6 @@ def get_indexer_for(self, target: Any) -> Any: ------- np.ndarray[np.intp] List of indices. - - Examples - -------- - Note Snowpark pandas converts np.nan, pd.NA, pd.NaT to None - >>> idx = pd.Index([np.nan, 'var1', np.nan]) - >>> idx.get_indexer_for([None]) - array([0, 2]) """ WarningMessage.index_to_pandas_warning("get_indexer_for") return self.to_pandas().get_indexer_for(target=target) @@ -2500,6 +2431,7 @@ def _get_indexer_strict(self, key: Any, axis_name: str) -> tuple[Index, np.ndarr tup = self.to_pandas()._get_indexer_strict(key=key, axis_name=axis_name) return self.__constructor__(tup[0]), tup[1] + @index_not_implemented() def get_level_values(self, level: int | str) -> Index: """ Return an Index of values for requested level. @@ -2520,17 +2452,6 @@ def get_level_values(self, level: int | str) -> Index: Notes ----- For Index, level should be 0, since there are no multiple levels. - - Examples - -------- - >>> idx = pd.Index(list('abc')) - >>> idx - Index(['a', 'b', 'c'], dtype='object') - - Get level values by supplying `level` as integer: - - >>> idx.get_level_values(0) - Index(['a', 'b', 'c'], dtype='object') """ WarningMessage.index_to_pandas_warning("get_level_values") return self.__constructor__(self.to_pandas().get_level_values(level=level)) @@ -2576,6 +2497,7 @@ def isin(self) -> None: """ # TODO: SNOW-1458153 implement isin + @index_not_implemented() def slice_indexer( self, start: Hashable | None = None, @@ -2608,14 +2530,6 @@ def slice_indexer( Notes ----- This function assumes that the data is sorted, so use at your own peril - - Examples - -------- - This is a method on all index types. For example you can do: - - >>> idx = pd.Index(list('abcd')) - >>> idx.slice_indexer(start='b', end='c') - slice(1, 3, None) """ WarningMessage.index_to_pandas_warning("slice_indexer") return self.to_pandas().slice_indexer(start=start, end=end, step=step) @@ -2816,7 +2730,6 @@ def str(self) -> native_pd.core.strings.accessor.StringMethods: 0 AStrSeries dtype: object """ - WarningMessage.index_to_pandas_warning("str") return self.to_pandas().str def _to_datetime( diff --git a/tests/integ/modin/frame/test_dropna.py b/tests/integ/modin/frame/test_dropna.py index d77c65d055e..c273c225edf 100644 --- a/tests/integ/modin/frame/test_dropna.py +++ b/tests/integ/modin/frame/test_dropna.py @@ -84,7 +84,7 @@ def test_axis_1_not_implemented(test_dropna_df): df.dropna(axis="columns") -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_dropna_negative(test_dropna_df): eval_snowpark_pandas_result( pd.DataFrame(test_dropna_df), @@ -122,14 +122,12 @@ def test_dropna_negative(test_dropna_df): expect_exception_match="['invalid']", ) - eval_snowpark_pandas_result( - pd.DataFrame(test_dropna_df), - test_dropna_df, - lambda df: df.dropna(subset=["invalid"], axis=1), - expect_exception=True, - expect_exception_type=KeyError, - expect_exception_match="['invalid']", - ) + with pytest.raises( + NotImplementedError, + match="Snowpark pandas dropna API doesn't yet support axis == 1", + ): + df = pd.DataFrame(test_dropna_df) + df.dropna(subset=["invalid"], axis=1) @pytest.mark.parametrize( diff --git a/tests/integ/modin/index/test_index_methods.py b/tests/integ/modin/index/test_index_methods.py index 8f6f5b9f59d..8d0434915ac 100644 --- a/tests/integ/modin/index/test_index_methods.py +++ b/tests/integ/modin/index/test_index_methods.py @@ -56,12 +56,16 @@ def test_df_index_copy(native_df): assert_index_equal(snow_df.columns, new_columns) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=0) @pytest.mark.parametrize("native_index", NATIVE_INDEX_TEST_DATA[2:]) def test_index_drop(native_index): snow_index = pd.Index(native_index) labels = [native_index[0]] - assert_index_equal(snow_index.drop(labels), native_index.drop(labels)) + with pytest.raises( + NotImplementedError, + match="Snowpark pandas does not yet support the method Index.drop", + ): + assert_index_equal(snow_index.drop(labels), native_index.drop(labels)) @sql_count_checker(query_count=3, join_count=1) @@ -79,26 +83,33 @@ def test_df_index_equals(native_df): assert snow_df.index.equals(native_df.index) -@sql_count_checker(query_count=8) +@sql_count_checker(query_count=0) def test_index_union(): idx1 = pd.Index([1, 2, 3, 4]) idx2 = pd.Index([3, 4, 5, 6]) - union = idx1.union(idx2) - assert_index_equal(union, pd.Index([1, 2, 3, 4, 5, 6], dtype="int64")) + with pytest.raises( + NotImplementedError, + match="Snowpark pandas does not yet support the method Index.union", + ): + idx1.union(idx2) idx1 = pd.Index(["a", "b", "c", "d"]) idx2 = pd.Index([1, 2, 3, 4]) - union = idx1.union(idx2) - assert_index_equal( - union, pd.Index(["a", "b", "c", "d", 1, 2, 3, 4], dtype="object") - ) + with pytest.raises( + NotImplementedError, + match="Snowpark pandas does not yet support the method Index.union", + ): + idx1.union(idx2) -@sql_count_checker(query_count=4) +@sql_count_checker(query_count=0) def test_index_difference(): idx1 = pd.Index([2, 1, 3, 4]) idx2 = pd.Index([3, 4, 5, 6]) - diff = idx1.difference(idx2) - assert_index_equal(diff, pd.Index([1, 2], dtype="int64")) + with pytest.raises( + NotImplementedError, + match="Snowpark pandas does not yet support the method Index.difference", + ): + idx1.difference(idx2) @sql_count_checker(query_count=4) @@ -109,20 +120,32 @@ def test_index_intersection(): assert_index_equal(diff, pd.Index([3, 4], dtype="int64")) -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=0) @pytest.mark.parametrize("native_index", NATIVE_INDEX_TEST_DATA) def test_index_get_level_values(native_index): snow_index = pd.Index(native_index) - assert_index_equal(snow_index.get_level_values(0), snow_index) + with pytest.raises( + NotImplementedError, + match="Snowpark pandas does not yet support the method Index.get_level_values", + ): + assert_index_equal(snow_index.get_level_values(0), snow_index) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=0) def test_slice_indexer(): idx = pd.Index(list("abcd")) - s = idx.slice_indexer(start="a", end="c") - assert s != slice(1, 3, None) - s = idx.slice_indexer(start="b", end="c") - assert s == slice(1, 3, None) + with pytest.raises( + NotImplementedError, + match="Snowpark pandas does not yet support the method Index.slice_indexer", + ): + s = idx.slice_indexer(start="a", end="c") + assert s != slice(1, 3, None) + with pytest.raises( + NotImplementedError, + match="Snowpark pandas does not yet support the method Index.slice_indexer", + ): + s = idx.slice_indexer(start="b", end="c") + assert s == slice(1, 3, None) @sql_count_checker(query_count=1) diff --git a/tests/integ/modin/test_unimplemented.py b/tests/integ/modin/test_unimplemented.py index deb5bce6af1..d9fdf04c78b 100644 --- a/tests/integ/modin/test_unimplemented.py +++ b/tests/integ/modin/test_unimplemented.py @@ -178,6 +178,13 @@ def test_unsupported_str_methods(func, func_name, caplog) -> None: # unsupported methods for Index UNSUPPORTED_INDEX_METHODS = [ + lambda idx: idx.duplicated(), + lambda idx: idx.drop(), + lambda idx: idx.union(), + lambda idx: idx.difference(), + lambda idx: idx.get_indexer_for(), + lambda idx: idx.get_level_values(), + lambda idx: idx.slice_indexer(), lambda idx: idx.nbytes(), lambda idx: idx.memory_usage(), lambda idx: idx.delete(), From ee8df557acd80fa9e5032045647dfdc298e1286e Mon Sep 17 00:00:00 2001 From: Jianzhun Du <68252326+sfc-gh-jdu@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:02:41 -0700 Subject: [PATCH 4/4] Cherry-pick release v1.21.1 note back to main (#2232) 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes SNOW-NNNNNNN 2. Fill out the following pre-review checklist: - [ ] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. 3. Please describe how your code solves the related issue. Please write a short description of how your code change solves the related issue. --- CHANGELOG.md | 9 ++++++++- recipe/meta.yaml | 2 +- src/snowflake/snowpark/version.py | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d4e56f50ab..fea42391259 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,7 +48,6 @@ - Fixed a bug in `session.get_session_stage` that referenced a non-existing stage after switching database or schema. - Fixed a bug where calling `DataFrame.to_snowpark_pandas_dataframe` without explicitly initializing the Snowpark pandas plugin caused an error. - Fixed a bug where using the `explode` function in dynamic table creation caused a SQL compilation error due to improper boolean type casting on the `outer` parameter. -- Fixed a bug where using `to_pandas_batches` with async jobs caused an error due to improper handling of waiting for asynchronous query completion. ### Snowpark Local Testing Updates @@ -132,6 +131,14 @@ - When calling `DataFrame.set_index`, or setting `DataFrame.index` or `Series.index`, with a new index that does not match the current length of the `Series`/`DataFrame` object, a `ValueError` is no longer raised. When the `Series`/`DataFrame` object is longer than the new index, the `Series`/`DataFrame`'s new index is filled with `NaN` values for the "extra" elements. When the `Series`/`DataFrame` object is shorter than the new index, the extra values in the new index are ignored—`Series` and `DataFrame` stay the same length `n`, and use only the first `n` values of the new index. +## 1.21.1 (2024-09-05) + +### Snowpark Python API Updates + +#### Bug Fixes + +- Fixed a bug where using `to_pandas_batches` with async jobs caused an error due to improper handling of waiting for asynchronous query completion. + ## 1.21.0 (2024-08-19) ### Snowpark Python API Updates diff --git a/recipe/meta.yaml b/recipe/meta.yaml index 275c6602aeb..cf1f2c9ad70 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "snowflake-snowpark-python" %} -{% set version = "1.21.0" %} +{% set version = "1.21.1" %} package: name: {{ name|lower }} diff --git a/src/snowflake/snowpark/version.py b/src/snowflake/snowpark/version.py index adb2095ce5a..3955dbbbf33 100644 --- a/src/snowflake/snowpark/version.py +++ b/src/snowflake/snowpark/version.py @@ -4,4 +4,4 @@ # # Update this for the versions -VERSION = (1, 21, 0) +VERSION = (1, 21, 1)