Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SNOW-1852934: Add support for DataFrame.map method #2754

Merged
merged 3 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
- Added support for `Series.str.center`.
- Added support for `Series.str.pad`.
- Added support for applying Snowpark Python function `snowflake_cortex_sentiment`.
- Added support for `DataFrame.map`.

#### Improvements
- Improve performance of `DataFrame.map`, `Series.apply` and `Series.map` methods by mapping numpy functions to snowpark functions if possible.
Expand Down
2 changes: 1 addition & 1 deletion docs/source/modin/supported/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``lt`` | P | ``level`` | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``map`` | N | | |
| ``map`` | P | | ``N`` if ``na_action == "ignore"`` |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``mask`` | P | | ``N`` if given ``axis`` when ``other`` is a |
| | | | ``DataFrame`` or ``level`` parameters; |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,15 @@ def decorator(base_method: Any):

# Avoid overwriting builtin `map` by accident
@register_dataframe_accessor("map")
@dataframe_not_implemented()
def _map(self, func, na_action: str | None = None, **kwargs) -> DataFrame:
pass # pragma: no cover
def _map(self, func: PythonFuncType, na_action: str | None = None, **kwargs):
# TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ticket, SNOW-1063346 (https://snowflakecomputing.atlassian.net/browse/SNOW-1063346), is done. What's the TODO here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this comment line was copied over and is no longer valid, since the ticket is done.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. @sfc-gh-joshi might have insight into what work is left before these TODOs can be removed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sfc-gh-dpetersohn originally left these comments in the code to identify which functions have differing implementations from the upstream modin versions, but I ended up using the ticket to track moving these methods into dataframe_overrides.py from the old snowflake/snowpark/modin/pandas/dataframe.py file. We can remove the TODO when we remove the override definition (i.e. we match the upstream modin implementation), though this might not be possible or desirable in all cases.

if not callable(func):
raise TypeError(f"{func} is not callable") # pragma: no cover
return self.__constructor__(
query_compiler=self._query_compiler.applymap(
func, na_action=na_action, **kwargs
)
)


@register_dataframe_not_implemented()
Expand Down Expand Up @@ -804,14 +810,12 @@ def apply(
# Snowpark pandas uses a separate QC method, while modin directly calls map.
@register_dataframe_accessor("applymap")
def applymap(self, func: PythonFuncType, na_action: str | None = None, **kwargs):
# TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions
if not callable(func):
raise TypeError(f"{func} is not callable")
return self.__constructor__(
query_compiler=self._query_compiler.applymap(
func, na_action=na_action, **kwargs
)
warnings.warn(
"DataFrame.applymap has been deprecated. Use DataFrame.map instead.",
FutureWarning,
stacklevel=2,
)
return self.map(func, na_action=na_action, **kwargs)


# We need to override _get_columns to satisfy
Expand Down
26 changes: 17 additions & 9 deletions tests/integ/modin/frame/test_applymap.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,25 +28,33 @@
)


@pytest.fixture(params=["applymap", "map"])
def method(request):
"""
method name to test.
"""
return request.param


@pytest.mark.parametrize("data,func,return_type", BASIC_DATA_FUNC_RETURN_TYPE_MAP)
@sql_count_checker(query_count=7, udf_count=1)
def test_applymap_basic_without_type_hints(data, func, return_type):
def test_applymap_basic_without_type_hints(data, func, return_type, method):
sfc-gh-nkumar marked this conversation as resolved.
Show resolved Hide resolved
frame_data = {0: data, 1: data}
native_df = native_pd.DataFrame(frame_data)
snow_df = pd.DataFrame(frame_data)
eval_snowpark_pandas_result(snow_df, native_df, lambda x: x.applymap(func))
eval_snowpark_pandas_result(snow_df, native_df, lambda x: getattr(x, method)(func))


@pytest.mark.parametrize("data,func,return_type", BASIC_DATA_FUNC_RETURN_TYPE_MAP)
@sql_count_checker(query_count=7, udf_count=1)
def test_applymap_basic_with_type_hints(data, func, return_type):
def test_applymap_basic_with_type_hints(data, func, return_type, method):
func_with_type_hint = create_func_with_return_type_hint(func, return_type)

frame_data = {0: data, 1: data}
native_df = native_pd.DataFrame(frame_data)
snow_df = pd.DataFrame(frame_data)
eval_snowpark_pandas_result(
snow_df, native_df, lambda x: x.applymap(func_with_type_hint)
snow_df, native_df, lambda x: getattr(x, method)(func_with_type_hint)
)


Expand Down Expand Up @@ -112,27 +120,27 @@ def test_applymap_numpy(func):


@sql_count_checker(query_count=0)
def test_applymap_na_action_ignore():
def test_applymap_na_action_ignore(method):
snow_df = pd.DataFrame([1, 1.1, "NaN", None], dtype="Float64")
msg = "Snowpark pandas applymap API doesn't yet support na_action == 'ignore'"
with pytest.raises(NotImplementedError, match=msg):
snow_df.applymap(lambda x: x is None, na_action="ignore")
getattr(snow_df, method)(lambda x: x is None, na_action="ignore")

data = ["cat", "dog", np.nan, "rabbit"]
snow_df = pd.DataFrame(data)
with pytest.raises(NotImplementedError, match=msg):
snow_df.applymap("I am a {}".format, na_action="ignore")
getattr(snow_df, method)("I am a {}".format, na_action="ignore")


@pytest.mark.parametrize("invalid_input", ["min", [np.min], {"a": np.max}])
@sql_count_checker(query_count=0)
def test_applymap_invalid_input(invalid_input):
def test_applymap_invalid_input(invalid_input, method):
snow_df = pd.DataFrame([1])
native_df = native_pd.DataFrame([1])
eval_snowpark_pandas_result(
snow_df,
native_df,
lambda x: x.applymap(invalid_input),
lambda x: getattr(x, method)(invalid_input),
expect_exception=True,
expect_exception_match="is not callable",
assert_exception_equal=False,
Expand Down
Loading