Skip to content

Commit

Permalink
SNOW-1119855: Remove modin/pandas/base.py (2/2) (#2167)
Browse files Browse the repository at this point in the history
<!---
Please answer these questions before creating your pull request. Thanks!
--->

1. Which Jira issue is this PR addressing? Make sure that there is an
accompanying issue to your PR.

   <!---
   In this section, please add a Snowflake Jira issue number.
   
Note that if a corresponding GitHub issue exists, you should still
include
   the Snowflake Jira issue number. For example, for GitHub issue
#1400, you should
   add "SNOW-1335071" here.
    --->

   Fixes SNOW-1119855

2. Fill out the following pre-review checklist:

- [ ] I am adding a new automated test(s) to verify correctness of my
new code
- [ ] If this test skips Local Testing mode, I'm requesting review from
@snowflakedb/local-testing
   - [ ] I am adding new logging messages
   - [ ] I am adding a new telemetry message
   - [ ] I am adding new credentials
   - [ ] I am adding a new dependency
- [ ] If this is a new feature/behavior, I'm adding the Local Testing
parity changes.

3. Please describe how your code solves the related issue.

This followup to #2059 removes our vendored copy of `base.py` altogether
from the codebase. Many methods are still overridden in
`snowflake/snowpark/modin/plugin/extensions/base_overrides.py`, with the
reason for overriding each given inline in the comments. These methods
are as follows:
<details>
<summary>list</summary>
<ul>
<li>agg/aggregate</li>
<li>_agg_helper</li>
<li>various aggregations (count, max, min, mean, median, std, sum,
var)</li>
<li>_binary_op</li>
<li>_dropna</li>
<li>fillna</li>
<li>isin</li>
<li>quantile</li>
<li>_to_series_list</li>
<li>shift</li>
<li>skew</li>
<li>resample</li>
<li>expanding</li>
<li>rolling</li>
<li>indexer properties (iloc, loc, iat, at)</li>
<li>__getitem__</li>
<li>sort_values</li>
<li>where</li>
<li>mask</li>
<li>to_csv</li>
<li>sample</li>
<li>pct_change</li>
<li>astype</li>
<li>drop</li>
<li>__len__</li>
<li>set_axis</li>
<li>describe</li>
<li>diff</li>
<li>tail</li>
<li>idxmax</li>
<li>idxmin</li>
<li>unary operators (abs, __invert__, __neg__)</li>
<li>rename_axis</li>
<li>__array__ufunc__</li>
<li>reindex</li>
<li>_get_index</li>
<li>_set_index</li>
</ul>
</details>

Some of these differences can be upstreamed fairly easily, and we will
work to upstream them once the updated modin build process for Snowflake
becomes clearer. Some methods will require significantly more work to
reconcile.
  • Loading branch information
sfc-gh-joshi authored Aug 30, 2024
1 parent 60ec43a commit 25c1006
Show file tree
Hide file tree
Showing 19 changed files with 2,298 additions and 101 deletions.
79 changes: 71 additions & 8 deletions src/snowflake/snowpark/modin/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,16 @@
timedelta_range,
)

import modin.pandas

# TODO: SNOW-851745 make sure add all Snowpark pandas API general functions
from modin.pandas import plotting # type: ignore[import]

from snowflake.snowpark.modin.pandas.dataframe import DataFrame
from snowflake.snowpark.modin.pandas.api.extensions import (
register_dataframe_accessor,
register_series_accessor,
)
from snowflake.snowpark.modin.pandas.dataframe import _DATAFRAME_EXTENSIONS_, DataFrame
from snowflake.snowpark.modin.pandas.general import (
concat,
crosstab,
Expand Down Expand Up @@ -140,15 +146,15 @@
read_xml,
to_pickle,
)
from snowflake.snowpark.modin.pandas.series import Series
from snowflake.snowpark.modin.pandas.series import _SERIES_EXTENSIONS_, Series
from snowflake.snowpark.modin.plugin._internal.session import SnowpandasSessionHolder
from snowflake.snowpark.modin.plugin._internal.telemetry import (
try_add_telemetry_to_attribute,
)

# The extensions assigned to this module
_PD_EXTENSIONS_: dict = {}

# base needs to be re-exported in order to properly override docstrings for BasePandasDataset
# moving this import higher prevents sphinx from building documentation (??)
from snowflake.snowpark.modin.pandas import base # isort: skip # noqa: E402,F401

import snowflake.snowpark.modin.plugin.extensions.pd_extensions as pd_extensions # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.pd_overrides # isort: skip # noqa: E402,F401
Expand All @@ -157,12 +163,71 @@
DatetimeIndex,
TimedeltaIndex,
)

# this must occur before overrides are applied
_attrs_defined_on_modin_base = set(dir(modin.pandas.base.BasePandasDataset))
_attrs_defined_on_series = set(
dir(Series)
) # TODO: SNOW-1063347 revisit when series.py is removed
_attrs_defined_on_dataframe = set(
dir(DataFrame)
) # TODO: SNOW-1063346 revisit when dataframe.py is removed

# base overrides occur before subclass overrides in case subclasses override a base method
import snowflake.snowpark.modin.plugin.extensions.base_extensions # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.base_overrides # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.dataframe_extensions # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.dataframe_overrides # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.series_extensions # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.series_overrides # isort: skip # noqa: E402,F401

# For any method defined on Series/DF, add telemetry to it if it meets all of the following conditions:
# 1. The method was defined directly on upstream BasePandasDataset (_attrs_defined_on_modin_base)
# 2. The method is not overridden by a child class (this will change)
# 3. The method is not overridden by an extensions module
# 4. The method name does not start with an _
#
# TODO: SNOW-1063347
# Since we still use the vendored version of Series and the overrides for the top-level
# namespace haven't been performed yet, we need to set properties on the vendored version
_base_telemetry_added_attrs = set()

_series_ext = _SERIES_EXTENSIONS_.copy()
for attr_name in dir(Series):
if (
attr_name in _attrs_defined_on_modin_base
and attr_name in _attrs_defined_on_series
and attr_name not in _series_ext
and not attr_name.startswith("_")
):
register_series_accessor(attr_name)(
try_add_telemetry_to_attribute(attr_name, getattr(Series, attr_name))
)
_base_telemetry_added_attrs.add(attr_name)

# TODO: SNOW-1063346
# Since we still use the vendored version of DataFrame and the overrides for the top-level
# namespace haven't been performed yet, we need to set properties on the vendored version
_dataframe_ext = _DATAFRAME_EXTENSIONS_.copy()
for attr_name in dir(DataFrame):
if (
attr_name in _attrs_defined_on_modin_base
and attr_name in _attrs_defined_on_dataframe
and attr_name not in _dataframe_ext
and not attr_name.startswith("_")
):
# If telemetry was already added via Series, register the override but don't re-wrap
# the method in the telemetry annotation. If we don't do this check, we will end up
# double-reporting telemetry on some methods.
original_attr = getattr(DataFrame, attr_name)
new_attr = (
original_attr
if attr_name in _base_telemetry_added_attrs
else try_add_telemetry_to_attribute(attr_name, original_attr)
)
register_dataframe_accessor(attr_name)(new_attr)
_base_telemetry_added_attrs.add(attr_name)


def __getattr__(name: str) -> Any:
"""
Expand Down Expand Up @@ -220,7 +285,6 @@ def __getattr__(name: str) -> Any:
"date_range",
"Index",
"MultiIndex",
"Series",
"bdate_range",
"period_range",
"DatetimeIndex",
Expand Down Expand Up @@ -318,8 +382,7 @@ def __getattr__(name: str) -> Any:
# Manually re-export the members of the pd_extensions namespace, which are not declared in __all__.
_EXTENSION_ATTRS = ["read_snowflake", "to_snowflake", "to_snowpark", "to_pandas"]
# We also need to re-export native_pd.offsets, since modin.pandas doesn't re-export it.
# snowflake.snowpark.pandas.base also needs to be re-exported to make docstring overrides for BasePandasDataset work.
_ADDITIONAL_ATTRS = ["offsets", "base"]
_ADDITIONAL_ATTRS = ["offsets"]

# This code should eventually be moved into the `snowflake.snowpark.modin.plugin` module instead.
# Currently, trying to do so would result in incorrect results because `snowflake.snowpark.modin.pandas`
Expand Down
6 changes: 4 additions & 2 deletions src/snowflake/snowpark/modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import numpy as np
import pandas
from modin.pandas.accessor import CachedAccessor, SparseFrameAccessor
from modin.pandas.base import BasePandasDataset

# from . import _update_engine
from modin.pandas.iterator import PartitionIterator
Expand Down Expand Up @@ -73,7 +74,6 @@
from pandas.util._validators import validate_bool_kwarg

from snowflake.snowpark.modin import pandas as pd
from snowflake.snowpark.modin.pandas.base import _ATTRS_NO_LOOKUP, BasePandasDataset
from snowflake.snowpark.modin.pandas.groupby import (
DataFrameGroupBy,
validate_groupby_args,
Expand All @@ -91,12 +91,14 @@
replace_external_data_keys_with_empty_pandas_series,
replace_external_data_keys_with_query_compiler,
)
from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta
from snowflake.snowpark.modin.plugin._internal.utils import is_repr_truncated
from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike
from snowflake.snowpark.modin.plugin.utils.error_message import (
ErrorMessage,
dataframe_not_implemented,
)
from snowflake.snowpark.modin.plugin.utils.frontend_constants import _ATTRS_NO_LOOKUP
from snowflake.snowpark.modin.plugin.utils.warning_message import (
SET_DATAFRAME_ATTRIBUTE_WARNING,
WarningMessage,
Expand Down Expand Up @@ -136,7 +138,7 @@
],
apilink="pandas.DataFrame",
)
class DataFrame(BasePandasDataset):
class DataFrame(BasePandasDataset, metaclass=TelemetryMeta):
_pandas_class = pandas.DataFrame

def __init__(
Expand Down
2 changes: 1 addition & 1 deletion src/snowflake/snowpark/modin/pandas/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import numpy as np
import pandas
import pandas.core.common as common
from modin.pandas.base import BasePandasDataset
from pandas import IntervalIndex, NaT, Timedelta, Timestamp
from pandas._libs import NaTType, lib
from pandas._libs.tslibs import to_offset
Expand Down Expand Up @@ -61,7 +62,6 @@

# add this line to make doctests runnable
from snowflake.snowpark.modin import pandas as pd # noqa: F401
from snowflake.snowpark.modin.pandas.base import BasePandasDataset
from snowflake.snowpark.modin.pandas.dataframe import DataFrame
from snowflake.snowpark.modin.pandas.series import Series
from snowflake.snowpark.modin.pandas.utils import (
Expand Down
2 changes: 1 addition & 1 deletion src/snowflake/snowpark/modin/pandas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@

import numpy as np
import pandas
from modin.pandas.base import BasePandasDataset
from pandas._libs.tslibs import Resolution, parsing
from pandas._typing import AnyArrayLike, Scalar
from pandas.api.types import is_bool, is_list_like
Expand All @@ -58,7 +59,6 @@

import snowflake.snowpark.modin.pandas as pd
import snowflake.snowpark.modin.pandas.utils as frontend_utils
from snowflake.snowpark.modin.pandas.base import BasePandasDataset
from snowflake.snowpark.modin.pandas.dataframe import DataFrame
from snowflake.snowpark.modin.pandas.series import (
SERIES_SETITEM_LIST_LIKE_KEY_AND_RANGE_LIKE_VALUE_ERROR_MESSAGE,
Expand Down
6 changes: 4 additions & 2 deletions src/snowflake/snowpark/modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import numpy.typing as npt
import pandas
from modin.pandas.accessor import CachedAccessor, SparseAccessor
from modin.pandas.base import BasePandasDataset
from modin.pandas.iterator import PartitionIterator
from pandas._libs.lib import NoDefault, is_integer, no_default
from pandas._typing import (
Expand All @@ -51,17 +52,18 @@
from pandas.core.series import _coerce_method
from pandas.util._validators import validate_bool_kwarg

from snowflake.snowpark.modin.pandas.base import _ATTRS_NO_LOOKUP, BasePandasDataset
from snowflake.snowpark.modin.pandas.utils import (
from_pandas,
is_scalar,
try_convert_index_to_native,
)
from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta
from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike
from snowflake.snowpark.modin.plugin.utils.error_message import (
ErrorMessage,
series_not_implemented,
)
from snowflake.snowpark.modin.plugin.utils.frontend_constants import _ATTRS_NO_LOOKUP
from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage
from snowflake.snowpark.modin.utils import (
MODIN_UNNAMED_SERIES_LABEL,
Expand Down Expand Up @@ -108,7 +110,7 @@
],
apilink="pandas.Series",
)
class Series(BasePandasDataset):
class Series(BasePandasDataset, metaclass=TelemetryMeta):
_pandas_class = pandas.Series
__array_priority__ = pandas.Series.__array_priority__

Expand Down
3 changes: 1 addition & 2 deletions src/snowflake/snowpark/modin/pandas/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,9 @@ def is_scalar(obj):
bool
True if given object is scalar and False otherwise.
"""
from modin.pandas.base import BasePandasDataset
from pandas.api.types import is_scalar as pandas_is_scalar

from .base import BasePandasDataset

return not isinstance(obj, BasePandasDataset) and pandas_is_scalar(obj)


Expand Down
26 changes: 17 additions & 9 deletions src/snowflake/snowpark/modin/plugin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,23 @@
import modin.utils # type: ignore[import] # isort: skip # noqa: E402
import modin.pandas.series_utils # type: ignore[import] # isort: skip # noqa: E402

modin.utils._inherit_docstrings(
docstrings.series_utils.StringMethods,
overwrite_existing=True,
)(modin.pandas.series_utils.StringMethods)

modin.utils._inherit_docstrings(
docstrings.series_utils.CombinedDatetimelikeProperties,
overwrite_existing=True,
)(modin.pandas.series_utils.DatetimeProperties)
# TODO: SNOW-1643979 pull in fixes for
# https://github.com/modin-project/modin/issues/7113 and https://github.com/modin-project/modin/issues/7134
# Upstream Modin has issues with certain docstring generation edge cases, so we should use our version instead
_inherit_docstrings = snowflake.snowpark.modin.utils._inherit_docstrings

inherit_modules = [
(docstrings.base.BasePandasDataset, modin.pandas.base.BasePandasDataset),
(docstrings.series_utils.StringMethods, modin.pandas.series_utils.StringMethods),
(
docstrings.series_utils.CombinedDatetimelikeProperties,
modin.pandas.series_utils.DatetimeProperties,
),
]

for (doc_module, target_object) in inherit_modules:
_inherit_docstrings(doc_module, overwrite_existing=True)(target_object)


# Don't warn the user about our internal usage of private preview pivot
# features. The user should have already been warned that Snowpark pandas
Expand Down
83 changes: 44 additions & 39 deletions src/snowflake/snowpark/modin/plugin/_internal/telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,49 @@ def wrap(*args, **kwargs): # type: ignore
}


def try_add_telemetry_to_attribute(attr_name: str, attr_value: Any) -> Any:
"""
Attempts to add telemetry to an attribute.
If the attribute is callable with name in TELEMETRY_PRIVATE_METHODS, or is a callable that
starts with an underscore, the original attribute will be returned as-is. Otherwise, a version
of the method/property annotated with Snowpark pandas telemetry is returned.
"""
if callable(attr_value) and (
not attr_name.startswith("_") or (attr_name in TELEMETRY_PRIVATE_METHODS)
):
return snowpark_pandas_telemetry_method_decorator(attr_value)
elif isinstance(attr_value, property):
# wrap on getter and setter
return property(
snowpark_pandas_telemetry_method_decorator(
cast(
# add a cast because mypy doesn't recognize that
# non-None fget and __get__ are both callable
# arguments to snowpark_pandas_telemetry_method_decorator.
Callable,
attr_value.__get__ # pragma: no cover: we don't encounter this case in pandas or modin because every property has an fget method.
if attr_value.fget is None
else attr_value.fget,
),
property_name=attr_name,
property_method_type=PropertyMethodType.FGET,
),
snowpark_pandas_telemetry_method_decorator(
attr_value.__set__ if attr_value.fset is None else attr_value.fset,
property_name=attr_name,
property_method_type=PropertyMethodType.FSET,
),
snowpark_pandas_telemetry_method_decorator(
attr_value.__delete__ if attr_value.fdel is None else attr_value.fdel,
property_name=attr_name,
property_method_type=PropertyMethodType.FDEL,
),
doc=attr_value.__doc__,
)
return attr_value


class TelemetryMeta(type):
def __new__(
cls, name: str, bases: tuple, attrs: dict[str, Any]
Expand Down Expand Up @@ -536,43 +579,5 @@ def __new__(
The modified class with decorated methods.
"""
for attr_name, attr_value in attrs.items():
if callable(attr_value) and (
not attr_name.startswith("_")
or (attr_name in TELEMETRY_PRIVATE_METHODS)
):
attrs[attr_name] = snowpark_pandas_telemetry_method_decorator(
attr_value
)
elif isinstance(attr_value, property):
# wrap on getter and setter
attrs[attr_name] = property(
snowpark_pandas_telemetry_method_decorator(
cast(
# add a cast because mypy doesn't recognize that
# non-None fget and __get__ are both callable
# arguments to snowpark_pandas_telemetry_method_decorator.
Callable,
attr_value.__get__ # pragma: no cover: we don't encounter this case in pandas or modin because every property has an fget method.
if attr_value.fget is None
else attr_value.fget,
),
property_name=attr_name,
property_method_type=PropertyMethodType.FGET,
),
snowpark_pandas_telemetry_method_decorator(
attr_value.__set__
if attr_value.fset is None
else attr_value.fset,
property_name=attr_name,
property_method_type=PropertyMethodType.FSET,
),
snowpark_pandas_telemetry_method_decorator(
attr_value.__delete__
if attr_value.fdel is None
else attr_value.fdel,
property_name=attr_name,
property_method_type=PropertyMethodType.FDEL,
),
doc=attr_value.__doc__,
)
attrs[attr_name] = try_add_telemetry_to_attribute(attr_name, attr_value)
return type.__new__(cls, name, bases, attrs)
Loading

0 comments on commit 25c1006

Please sign in to comment.