Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SNOW-1119855: Remove modin/pandas/base.py (2/2) #2167

Merged
merged 14 commits into from
Aug 30, 2024
79 changes: 71 additions & 8 deletions src/snowflake/snowpark/modin/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,16 @@
timedelta_range,
)

import modin.pandas

# TODO: SNOW-851745 make sure add all Snowpark pandas API general functions
from modin.pandas import plotting # type: ignore[import]

from snowflake.snowpark.modin.pandas.dataframe import DataFrame
from snowflake.snowpark.modin.pandas.api.extensions import (
register_dataframe_accessor,
register_series_accessor,
)
from snowflake.snowpark.modin.pandas.dataframe import _DATAFRAME_EXTENSIONS_, DataFrame
from snowflake.snowpark.modin.pandas.general import (
concat,
crosstab,
Expand Down Expand Up @@ -140,15 +146,15 @@
read_xml,
to_pickle,
)
from snowflake.snowpark.modin.pandas.series import Series
from snowflake.snowpark.modin.pandas.series import _SERIES_EXTENSIONS_, Series
from snowflake.snowpark.modin.plugin._internal.session import SnowpandasSessionHolder
from snowflake.snowpark.modin.plugin._internal.telemetry import (
try_add_telemetry_to_attribute,
)

# The extensions assigned to this module
_PD_EXTENSIONS_: dict = {}

# base needs to be re-exported in order to properly override docstrings for BasePandasDataset
# moving this import higher prevents sphinx from building documentation (??)
from snowflake.snowpark.modin.pandas import base # isort: skip # noqa: E402,F401

import snowflake.snowpark.modin.plugin.extensions.pd_extensions as pd_extensions # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.pd_overrides # isort: skip # noqa: E402,F401
Expand All @@ -157,12 +163,71 @@
DatetimeIndex,
TimedeltaIndex,
)

# this must occur before overrides are applied
_attrs_defined_on_modin_base = set(dir(modin.pandas.base.BasePandasDataset))
_attrs_defined_on_series = set(
dir(Series)
) # TODO: SNOW-1063347 revisit when series.py is removed
_attrs_defined_on_dataframe = set(
dir(DataFrame)
) # TODO: SNOW-1063346 revisit when dataframe.py is removed

# base overrides occur before subclass overrides in case subclasses override a base method
import snowflake.snowpark.modin.plugin.extensions.base_extensions # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.base_overrides # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.dataframe_extensions # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.dataframe_overrides # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.series_extensions # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.series_overrides # isort: skip # noqa: E402,F401

# For any method defined on Series/DF, add telemetry to it if it meets all of the following conditions:
# 1. The method was defined directly on upstream BasePandasDataset (_attrs_defined_on_modin_base)
# 2. The method is not overridden by a child class (this will change)
# 3. The method is not overridden by an extensions module
# 4. The method name does not start with an _
#
# TODO: SNOW-1063347
# Since we still use the vendored version of Series and the overrides for the top-level
# namespace haven't been performed yet, we need to set properties on the vendored version
_base_telemetry_added_attrs = set()

_series_ext = _SERIES_EXTENSIONS_.copy()
for attr_name in dir(Series):
if (
attr_name in _attrs_defined_on_modin_base
and attr_name in _attrs_defined_on_series
and attr_name not in _series_ext
and not attr_name.startswith("_")
):
register_series_accessor(attr_name)(
try_add_telemetry_to_attribute(attr_name, getattr(Series, attr_name))
)
_base_telemetry_added_attrs.add(attr_name)

# TODO: SNOW-1063346
# Since we still use the vendored version of DataFrame and the overrides for the top-level
# namespace haven't been performed yet, we need to set properties on the vendored version
_dataframe_ext = _DATAFRAME_EXTENSIONS_.copy()
for attr_name in dir(DataFrame):
if (
attr_name in _attrs_defined_on_modin_base
and attr_name in _attrs_defined_on_dataframe
and attr_name not in _dataframe_ext
and not attr_name.startswith("_")
):
# If telemetry was already added via Series, register the override but don't re-wrap
# the method in the telemetry annotation. If we don't do this check, we will end up
# double-reporting telemetry on some methods.
original_attr = getattr(DataFrame, attr_name)
new_attr = (
original_attr
if attr_name in _base_telemetry_added_attrs
else try_add_telemetry_to_attribute(attr_name, original_attr)
)
register_dataframe_accessor(attr_name)(new_attr)
_base_telemetry_added_attrs.add(attr_name)


def __getattr__(name: str) -> Any:
"""
Expand Down Expand Up @@ -220,7 +285,6 @@ def __getattr__(name: str) -> Any:
"date_range",
"Index",
"MultiIndex",
"Series",
sfc-gh-rdurrani marked this conversation as resolved.
Show resolved Hide resolved
"bdate_range",
"period_range",
"DatetimeIndex",
Expand Down Expand Up @@ -318,8 +382,7 @@ def __getattr__(name: str) -> Any:
# Manually re-export the members of the pd_extensions namespace, which are not declared in __all__.
_EXTENSION_ATTRS = ["read_snowflake", "to_snowflake", "to_snowpark", "to_pandas"]
# We also need to re-export native_pd.offsets, since modin.pandas doesn't re-export it.
# snowflake.snowpark.pandas.base also needs to be re-exported to make docstring overrides for BasePandasDataset work.
_ADDITIONAL_ATTRS = ["offsets", "base"]
sfc-gh-yzou marked this conversation as resolved.
Show resolved Hide resolved
_ADDITIONAL_ATTRS = ["offsets"]

# This code should eventually be moved into the `snowflake.snowpark.modin.plugin` module instead.
# Currently, trying to do so would result in incorrect results because `snowflake.snowpark.modin.pandas`
Expand Down
6 changes: 4 additions & 2 deletions src/snowflake/snowpark/modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import numpy as np
import pandas
from modin.pandas.accessor import CachedAccessor, SparseFrameAccessor
from modin.pandas.base import BasePandasDataset

# from . import _update_engine
from modin.pandas.iterator import PartitionIterator
Expand Down Expand Up @@ -73,7 +74,6 @@
from pandas.util._validators import validate_bool_kwarg

from snowflake.snowpark.modin import pandas as pd
from snowflake.snowpark.modin.pandas.base import _ATTRS_NO_LOOKUP, BasePandasDataset
from snowflake.snowpark.modin.pandas.groupby import (
DataFrameGroupBy,
validate_groupby_args,
Expand All @@ -91,12 +91,14 @@
replace_external_data_keys_with_empty_pandas_series,
replace_external_data_keys_with_query_compiler,
)
from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta
from snowflake.snowpark.modin.plugin._internal.utils import is_repr_truncated
from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike
from snowflake.snowpark.modin.plugin.utils.error_message import (
ErrorMessage,
dataframe_not_implemented,
)
from snowflake.snowpark.modin.plugin.utils.frontend_constants import _ATTRS_NO_LOOKUP
from snowflake.snowpark.modin.plugin.utils.warning_message import (
SET_DATAFRAME_ATTRIBUTE_WARNING,
WarningMessage,
Expand Down Expand Up @@ -136,7 +138,7 @@
],
apilink="pandas.DataFrame",
)
class DataFrame(BasePandasDataset):
class DataFrame(BasePandasDataset, metaclass=TelemetryMeta):
_pandas_class = pandas.DataFrame

def __init__(
Expand Down
2 changes: 1 addition & 1 deletion src/snowflake/snowpark/modin/pandas/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import numpy as np
import pandas
import pandas.core.common as common
from modin.pandas.base import BasePandasDataset
from pandas import IntervalIndex, NaT, Timedelta, Timestamp
from pandas._libs import NaTType, lib
from pandas._libs.tslibs import to_offset
Expand Down Expand Up @@ -61,7 +62,6 @@

# add this line to make doctests runnable
from snowflake.snowpark.modin import pandas as pd # noqa: F401
from snowflake.snowpark.modin.pandas.base import BasePandasDataset
from snowflake.snowpark.modin.pandas.dataframe import DataFrame
from snowflake.snowpark.modin.pandas.series import Series
from snowflake.snowpark.modin.pandas.utils import (
Expand Down
2 changes: 1 addition & 1 deletion src/snowflake/snowpark/modin/pandas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@

import numpy as np
import pandas
from modin.pandas.base import BasePandasDataset
from pandas._libs.tslibs import Resolution, parsing
from pandas._typing import AnyArrayLike, Scalar
from pandas.api.types import is_bool, is_list_like
Expand All @@ -58,7 +59,6 @@

import snowflake.snowpark.modin.pandas as pd
import snowflake.snowpark.modin.pandas.utils as frontend_utils
from snowflake.snowpark.modin.pandas.base import BasePandasDataset
from snowflake.snowpark.modin.pandas.dataframe import DataFrame
from snowflake.snowpark.modin.pandas.series import (
SERIES_SETITEM_LIST_LIKE_KEY_AND_RANGE_LIKE_VALUE_ERROR_MESSAGE,
Expand Down
6 changes: 4 additions & 2 deletions src/snowflake/snowpark/modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import numpy.typing as npt
import pandas
from modin.pandas.accessor import CachedAccessor, SparseAccessor
from modin.pandas.base import BasePandasDataset
from modin.pandas.iterator import PartitionIterator
from pandas._libs.lib import NoDefault, is_integer, no_default
from pandas._typing import (
Expand All @@ -51,17 +52,18 @@
from pandas.core.series import _coerce_method
from pandas.util._validators import validate_bool_kwarg

from snowflake.snowpark.modin.pandas.base import _ATTRS_NO_LOOKUP, BasePandasDataset
from snowflake.snowpark.modin.pandas.utils import (
from_pandas,
is_scalar,
try_convert_index_to_native,
)
from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta
from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike
from snowflake.snowpark.modin.plugin.utils.error_message import (
ErrorMessage,
series_not_implemented,
)
from snowflake.snowpark.modin.plugin.utils.frontend_constants import _ATTRS_NO_LOOKUP
from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage
from snowflake.snowpark.modin.utils import (
MODIN_UNNAMED_SERIES_LABEL,
Expand Down Expand Up @@ -108,7 +110,7 @@
],
apilink="pandas.Series",
)
class Series(BasePandasDataset):
class Series(BasePandasDataset, metaclass=TelemetryMeta):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it seems we will have to overwrite the class definition regardless, otherwise the telemetry will not work. Or is modin providing any telemetry infra for use? i recall @sfc-gh-azhan and @sfc-gh-nkrishna was doing something for telemetry. I don't think we need to block on this pr for this, let's follow up offline

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Modin has its own telemetry mechanisms, but there's not really a good way for us to hook into it to send data back to Snowflake.

As far as I can tell, the TelemetryMeta class basically wraps every method/property of the class in a flavor of snowpark_pandas_telemetry_decorator, so once we fully use the modin frontend, we could monkeypatch those methods at runtime to have this behavior.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i recall that you mentioned that we need to overwrite the class definition for Series, so that it can take snowpandas index as an input, if we also put metaclass=TelemetryMeta when overwrite the class definition, would it work?
if we do it as monkey patch later, how would it look like?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think what I referred to was overriding the __init__ method of Series, not the actual class declaration itself. If we were to override the class declaration, it wouldn't be much different from what we're currently doing.

My thinking is that when we remove Series/DF, we do something like this:

for attr_name, attr_value in Series.__dict__:
    register_series_override(attr_name)(snowpark_pandas_telemetry_method(attr_value))

which would overwrite the existing method with a telemetry-enabled version.

_pandas_class = pandas.Series
__array_priority__ = pandas.Series.__array_priority__

Expand Down
3 changes: 1 addition & 2 deletions src/snowflake/snowpark/modin/pandas/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,9 @@ def is_scalar(obj):
bool
True if given object is scalar and False otherwise.
"""
from modin.pandas.base import BasePandasDataset
from pandas.api.types import is_scalar as pandas_is_scalar

from .base import BasePandasDataset

return not isinstance(obj, BasePandasDataset) and pandas_is_scalar(obj)


Expand Down
26 changes: 17 additions & 9 deletions src/snowflake/snowpark/modin/plugin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,23 @@
import modin.utils # type: ignore[import] # isort: skip # noqa: E402
import modin.pandas.series_utils # type: ignore[import] # isort: skip # noqa: E402

modin.utils._inherit_docstrings(
docstrings.series_utils.StringMethods,
overwrite_existing=True,
)(modin.pandas.series_utils.StringMethods)

modin.utils._inherit_docstrings(
docstrings.series_utils.CombinedDatetimelikeProperties,
overwrite_existing=True,
)(modin.pandas.series_utils.DatetimeProperties)
# TODO: SNOW-1643979 pull in fixes for
# https://github.com/modin-project/modin/issues/7113 and https://github.com/modin-project/modin/issues/7134
# Upstream Modin has issues with certain docstring generation edge cases, so we should use our version instead
sfc-gh-yzou marked this conversation as resolved.
Show resolved Hide resolved
_inherit_docstrings = snowflake.snowpark.modin.utils._inherit_docstrings

inherit_modules = [
(docstrings.base.BasePandasDataset, modin.pandas.base.BasePandasDataset),
(docstrings.series_utils.StringMethods, modin.pandas.series_utils.StringMethods),
(
docstrings.series_utils.CombinedDatetimelikeProperties,
modin.pandas.series_utils.DatetimeProperties,
),
]

for (doc_module, target_object) in inherit_modules:
_inherit_docstrings(doc_module, overwrite_existing=True)(target_object)


# Don't warn the user about our internal usage of private preview pivot
# features. The user should have already been warned that Snowpark pandas
Expand Down
83 changes: 44 additions & 39 deletions src/snowflake/snowpark/modin/plugin/_internal/telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,49 @@ def wrap(*args, **kwargs): # type: ignore
}


def try_add_telemetry_to_attribute(attr_name: str, attr_value: Any) -> Any:
"""
Attempts to add telemetry to an attribute.

If the attribute is callable with name in TELEMETRY_PRIVATE_METHODS, or is a callable that
starts with an underscore, the original attribute will be returned as-is. Otherwise, a version
of the method/property annotated with Snowpark pandas telemetry is returned.
"""
if callable(attr_value) and (
not attr_name.startswith("_") or (attr_name in TELEMETRY_PRIVATE_METHODS)
):
return snowpark_pandas_telemetry_method_decorator(attr_value)
elif isinstance(attr_value, property):
# wrap on getter and setter
return property(
snowpark_pandas_telemetry_method_decorator(
cast(
# add a cast because mypy doesn't recognize that
# non-None fget and __get__ are both callable
# arguments to snowpark_pandas_telemetry_method_decorator.
Callable,
attr_value.__get__ # pragma: no cover: we don't encounter this case in pandas or modin because every property has an fget method.
if attr_value.fget is None
else attr_value.fget,
),
property_name=attr_name,
property_method_type=PropertyMethodType.FGET,
),
snowpark_pandas_telemetry_method_decorator(
attr_value.__set__ if attr_value.fset is None else attr_value.fset,
property_name=attr_name,
property_method_type=PropertyMethodType.FSET,
),
snowpark_pandas_telemetry_method_decorator(
attr_value.__delete__ if attr_value.fdel is None else attr_value.fdel,
property_name=attr_name,
property_method_type=PropertyMethodType.FDEL,
),
doc=attr_value.__doc__,
)
return attr_value


class TelemetryMeta(type):
def __new__(
cls, name: str, bases: tuple, attrs: dict[str, Any]
Expand Down Expand Up @@ -536,43 +579,5 @@ def __new__(
The modified class with decorated methods.
"""
for attr_name, attr_value in attrs.items():
if callable(attr_value) and (
not attr_name.startswith("_")
or (attr_name in TELEMETRY_PRIVATE_METHODS)
):
attrs[attr_name] = snowpark_pandas_telemetry_method_decorator(
attr_value
)
elif isinstance(attr_value, property):
# wrap on getter and setter
attrs[attr_name] = property(
snowpark_pandas_telemetry_method_decorator(
cast(
# add a cast because mypy doesn't recognize that
# non-None fget and __get__ are both callable
# arguments to snowpark_pandas_telemetry_method_decorator.
Callable,
attr_value.__get__ # pragma: no cover: we don't encounter this case in pandas or modin because every property has an fget method.
if attr_value.fget is None
else attr_value.fget,
),
property_name=attr_name,
property_method_type=PropertyMethodType.FGET,
),
snowpark_pandas_telemetry_method_decorator(
attr_value.__set__
if attr_value.fset is None
else attr_value.fset,
property_name=attr_name,
property_method_type=PropertyMethodType.FSET,
),
snowpark_pandas_telemetry_method_decorator(
attr_value.__delete__
if attr_value.fdel is None
else attr_value.fdel,
property_name=attr_name,
property_method_type=PropertyMethodType.FDEL,
),
doc=attr_value.__doc__,
)
attrs[attr_name] = try_add_telemetry_to_attribute(attr_name, attr_value)
return type.__new__(cls, name, bases, attrs)
Loading
Loading