diff --git a/CHANGELOG.md b/CHANGELOG.md
index beb1e104468..05b56da1144 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -82,6 +82,7 @@
 - Added support for index's arithmetic and comparison operators.
 - Added support for `Series.dt.round`.
 - Added documentation pages for `DatetimeIndex`.
+- Added support for `Index.name`, `Index.names`, `Index.rename`, and `Index.set_names`.

 #### Improvements
 - Removed the public preview warning message upon importing Snowpark pandas.
diff --git a/docs/source/modin/supported/index_supported.rst b/docs/source/modin/supported/index_supported.rst
index 8ab17661279..d0754a1c6cc 100644
--- a/docs/source/modin/supported/index_supported.rst
+++ b/docs/source/modin/supported/index_supported.rst
@@ -38,7 +38,7 @@ Attributes
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``name``                    | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``names``                   | P                               |                                                    |
+| ``names``                   | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``nbytes``                  | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
@@ -114,7 +114,7 @@ Methods
 | ``reindex``                 | P                               |                                  | ``N`` if the Index values are tuple-like, or      |
 |                             |                                 |                                  | method is ``nearest``.                             |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``rename``                  | N                               |                                  |                                                    |
+| ``rename``                  | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``repeat``                  | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
@@ -130,7 +130,7 @@
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``value_counts``            | P                               | ``bins``                         |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``set_names``               | N                               |                                  |                                                    |
+| ``set_names``               | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``droplevel``               | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
diff --git a/src/snowflake/snowpark/modin/pandas/base.py b/src/snowflake/snowpark/modin/pandas/base.py
index 43ed5ee389f..f7e9da9f89a 100644
--- a/src/snowflake/snowpark/modin/pandas/base.py
+++ b/src/snowflake/snowpark/modin/pandas/base.py
@@ -674,7 +674,7 @@ def _get_index(self):
             return self._query_compiler.index
         idx = Index(query_compiler=self._query_compiler)
-        idx._parent = self
+        idx._set_parent(self)
         return idx

     index = property(_get_index, _set_index)
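The `_set_parent` call above is what lets an `Index` remember the `Series` or `DataFrame` it came from, so that later in-place renames can be pushed back to that parent (see the `index.py` changes that follow). For illustration only, a stripped-down sketch of the pattern; `ToyIndex` and `ToyFrame` are hypothetical stand-ins, not the real Snowpark pandas types:

# Hypothetical toy classes illustrating the parent-tracking pattern; these are
# NOT the real Snowpark pandas implementations.
class ToyIndex:
    def __init__(self, name=None):
        self._name = name
        self._parent = None  # filled in lazily, mirrors Index._parent

    def _set_parent(self, parent):
        self._parent = parent

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, value):
        self._name = value
        if self._parent is not None:
            # push the rename back to the owning container
            self._parent.index_name = value


class ToyFrame:
    def __init__(self):
        self.index_name = None

    @property
    def index(self):
        idx = ToyIndex(self.index_name)
        idx._set_parent(self)  # mirrors `idx._set_parent(self)` in base.py
        return idx


frame = ToyFrame()
frame.index.name = "id"
assert frame.index_name == "id"  # the in-place rename reached the parent

In the actual patch the propagation goes through `_update_inplace` with a query compiler returned by `set_index_names`, rather than a plain attribute assignment.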
diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py
index f6e02034849..c1a70570001 100644
--- a/src/snowflake/snowpark/modin/plugin/extensions/index.py
+++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py
@@ -30,10 +30,12 @@
 import numpy as np
 import pandas as native_pd
 from pandas._libs import lib
+from pandas._libs.lib import is_list_like, is_scalar
 from pandas._typing import ArrayLike, DateTimeErrorChoices, DtypeObj, NaPosition
 from pandas.core.arrays import ExtensionArray
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.common import is_datetime64_any_dtype, pandas_dtype
+from pandas.core.dtypes.inference import is_hashable

 from snowflake.snowpark.modin.pandas import DataFrame, Series
 from snowflake.snowpark.modin.pandas.base import BasePandasDataset
@@ -74,7 +76,7 @@ def __new__(
     ) -> Index:
         """
         Override __new__ method to control new instance creation of Index.
-        Depending on data type, it will create a Index or DatetimeIndex instance.
+        Depending on data type, it will create an Index or DatetimeIndex instance.

         Parameters
         ----------
@@ -177,6 +179,8 @@ def _init_index(
         query_compiler: SnowflakeQueryCompiler = None,
         **kwargs: Any,
     ):
+        # `_parent` keeps track of any Series or DataFrame that this Index is a part of.
+        self._parent = None
         if query_compiler:
             # Raise warning if `data` is query compiler with non-default arguments.
             for arg_name, arg_value in kwargs.items():
@@ -336,6 +340,12 @@ def __constructor__(self):
         """
         return type(self)

+    def _set_parent(self, parent: Series | DataFrame):
+        """
+        Set the parent object of the current Index to a given Series or DataFrame.
+        """
+        self._parent = parent
+
     @property
     def values(self) -> ArrayLike:
         """
@@ -612,7 +622,7 @@ def name(self) -> Hashable:
         Returns
         -------
         Hashable
-            name of this index
+            Name of this index.

         Examples
         --------
@@ -629,7 +639,13 @@ def name(self, value: Hashable) -> None:
         """
         Set Index name.
         """
+        if not is_hashable(value):
+            raise TypeError(f"{type(self).__name__}.name must be a hashable type")
         self._query_compiler = self._query_compiler.set_index_names([value])
+        if self._parent is not None:
+            self._parent._update_inplace(
+                new_query_compiler=self._parent._query_compiler.set_index_names([value])
+            )

     def _get_names(self) -> list[Hashable]:
         """
@@ -651,6 +667,10 @@ def _set_names(self, values: list) -> None:
         TypeError if each name is not hashable.
         """
         self._query_compiler = self._query_compiler.set_index_names(values)
+        if self._parent is not None:
+            self._parent._update_inplace(
+                new_query_compiler=self._parent._query_compiler.set_index_names(values)
+            )

     names = property(fset=_set_names, fget=_get_names)
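Because the `name` and `names` setters above now also update `_parent`, renaming an index in place becomes visible on the owning DataFrame or Series, for example after `reset_index()`; this is the behavior exercised by `test_index_SNOW_1021837` in the new test file further down. A minimal usage sketch, assuming an active Snowpark pandas session (the frame contents are arbitrary):

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

df = pd.DataFrame({"a": [1, 2]})  # arbitrary example frame
df.index.name = "row_id"          # setter also updates the parent DataFrame
assert "row_id" in df.reset_index().columns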
@@ -685,13 +705,23 @@ def set_names(
         >>> idx.set_names('quarter')
         Index([1, 2, 3, 4], dtype='int64', name='quarter')
         """
-        # TODO: SNOW-1458122 implement set_names
-        WarningMessage.index_to_pandas_warning("set_names")
-        if not inplace:
-            return self.__constructor__(
-                self.to_pandas().set_names(names, level=level, inplace=inplace)
+        if is_list_like(names) and len(names) > 1:
+            raise ValueError(
+                f"Since Index is a single index object in Snowpark pandas, "
+                f"the length of new names must be 1, got {len(names)}."
+            )
+        if level is not None and level not in [0, -1]:
+            raise IndexError(
+                f"Level does not exist: Index has only 1 level, {level} is not a valid level number."
             )
-        return self.to_pandas().set_names(names, level=level, inplace=inplace)
+        if inplace:
+            name = names[0] if is_list_like(names) else names
+            self.name = name
+            return None
+        else:
+            res = self.__constructor__(query_compiler=self._query_compiler)
+            res.name = names if is_scalar(names) else names[0]
+            return res

     @property
     def ndim(self) -> int:
@@ -1521,8 +1551,7 @@ def reindex(
         )
         return Index(query_compiler=query_compiler), indices

-    @index_not_implemented()
-    def rename(self) -> None:
+    def rename(self, name: Any, inplace: bool = False) -> None:
         """
         Alter Index or MultiIndex name.

@@ -1545,8 +1574,29 @@
         See Also
         --------
         Index.set_names : Able to set new names partially and by level.
+
+        Examples
+        --------
+        >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score')
+        >>> idx.rename('grade', inplace=False)
+        Index(['A', 'C', 'A', 'B'], dtype='object', name='grade')
+        >>> idx.rename('grade', inplace=True)
+
+        Note
+        ----
+        Native pandas only allows hashable types for names. Snowpark pandas allows
+        name to be any scalar or list-like type. If a tuple is used for the name,
+        the tuple itself will be the name.
+
+        For instance,
+        >>> idx = pd.Index([1, 2, 3])
+        >>> idx.rename(('a', 'b', 'c'), inplace=True)
+        >>> idx.name
+        ('a', 'b', 'c')
         """
-        # TODO: SNOW-1458122 implement rename
+        if isinstance(name, tuple):
+            name = [name]  # The entire tuple is the name
+        return self.set_names(names=name, inplace=inplace)

     def nunique(self, dropna: bool = True) -> int:
         """
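One subtlety in the new `rename` and `set_names` above: `rename` wraps a tuple so that the whole tuple becomes the name, while `set_names` treats a one-element list-like as the list of names to apply. A small sketch of the intended behavior, based on the code above and the tests below, assuming an active Snowpark pandas session:

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

idx = pd.Index(["A", "C", "A", "B"], name="score")
idx.rename(("a", "b"), inplace=True)     # the tuple itself becomes the name
assert idx.name == ("a", "b")

idx2 = pd.Index(["A", "C"], name="score")
idx2.set_names(["grade"], inplace=True)  # one-element list-like -> name "grade"
assert idx2.name == "grade"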
diff --git a/tests/integ/modin/frame/test_nlargest_nsmallest.py b/tests/integ/modin/frame/test_nlargest_nsmallest.py
index 3b6318179f2..1a2b13db7a5 100644
--- a/tests/integ/modin/frame/test_nlargest_nsmallest.py
+++ b/tests/integ/modin/frame/test_nlargest_nsmallest.py
@@ -54,7 +54,7 @@ def test_nlargest_nsmallest_large_n(snow_df, native_df, method):
     )


-@sql_count_checker(query_count=4, join_count=1)
+@sql_count_checker(query_count=3)
 def test_nlargest_nsmallest_overlapping_index_name(snow_df, native_df, method):
     snow_df = snow_df.rename_axis("A")
     native_df = native_df.rename_axis("A")
diff --git a/tests/integ/modin/index/test_name.py b/tests/integ/modin/index/test_name.py
new file mode 100644
index 00000000000..0397ed1546e
--- /dev/null
+++ b/tests/integ/modin/index/test_name.py
@@ -0,0 +1,319 @@
+#
+# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
+#
+
+import modin.pandas as pd
+import pandas as native_pd
+import pytest
+
+import snowflake.snowpark.modin.plugin  # noqa: F401
+from tests.integ.modin.sql_counter import sql_count_checker
+from tests.integ.modin.utils import assert_frame_equal
+
+
+@sql_count_checker(query_count=0)
+def test_index_parent_name():
+    """
+    Check whether the Index's parent's name is updated correctly.
+    Changing the index's name should also change the parent's name.
+    """
+    native_idx1 = native_pd.Index(["A", "B"], name="xyz")
+    native_idx2 = native_pd.Index(["A", "B", "D", "E", "G", "H"], name="CFI")
+
+    # DataFrame case.
+    df = pd.DataFrame([[1, 2], [3, 4]], index=native_idx1)
+    snow_idx1 = df.index
+    assert snow_idx1.name == df.index.name == "xyz"  # compare original name
+    snow_idx1.name = "new_name 1"  # set new name
+    assert snow_idx1.name == df.index.name == "new_name 1"  # compare new name
+
+    # Series case.
+    s = pd.Series([1, 2, 4, 5, 6, 7], index=native_idx2, name="zyx")
+    snow_idx2 = s.index
+    assert snow_idx2.name == s.index.name == "CFI"  # compare original name
+    snow_idx2.name = "new_name 2"  # set new name
+    assert snow_idx2.name == s.index.name == "new_name 2"  # compare new name
+
+
+@sql_count_checker(query_count=0)
+def test_index_parent_names():
+    """
+    Check whether the Index's parent's names are updated correctly.
+    Changing the index's names should also change the parent's names.
+    """
+    native_idx1 = native_pd.Index(["A", "B"], name="xyz")
+    native_idx2 = native_pd.Index(["A", "B", "D", "E", "G", "H"], name="CFI")
+
+    # DataFrame case.
+    df = pd.DataFrame([[1, 2], [3, 4]], index=native_idx1)
+    snow_idx1 = df.index
+    assert snow_idx1.names == df.index.names == ["xyz"]  # compare original names
+    snow_idx1.names = ["new_name"]  # set new names
+    assert snow_idx1.names[0] == df.index.names[0] == "new_name"  # compare new names
+    assert len(snow_idx1.names) == len(df.index.names) == 1
+
+    # Series case.
+    s = pd.Series([1, 2, 4, 5, 6, 7], index=native_idx2, name="zyx")
+    snow_idx2 = s.index
+    assert snow_idx2.names == s.index.names == ["CFI"]  # compare original names
+    snow_idx2.names = ["new_name 2"]  # set new names
+    assert snow_idx2.names == ["new_name 2"]  # compare new names
+    assert snow_idx2.names == s.index.names
+
+
+@pytest.mark.parametrize("new_name", [None, "grade", ("grade",), ("A", "B")])
+@sql_count_checker(query_count=0)
+def test_index_rename_inplace(new_name):
+    native_idx = native_pd.Index(["A", "C", "A", "B"], name="score")
+    snow_idx = pd.Index(native_idx)
+
+    # Rename the index in place.
+    native_res = native_idx.rename(new_name, inplace=True)
+    snow_res = snow_idx.rename(new_name, inplace=True)
+
+    # Verify that the return value is None, and `name` and `names` match.
+    assert native_res is None
+    assert snow_res is None
+    assert native_idx.name == snow_idx.name == new_name
+    assert native_idx.names == snow_idx.names == [new_name]
+
+
+@pytest.mark.parametrize("new_name", [None, "grade", ("grade",), ("A", "B")])
+@sql_count_checker(query_count=0)
+def test_index_rename_copy(new_name):
+    native_idx = native_pd.Index(["A", "C", "A", "B"], name="score")
+    snow_idx = pd.Index(native_idx)
+
+    # Rename the index and create a new index.
+    new_native_idx = native_idx.rename(new_name, inplace=False)
+    new_snow_idx = snow_idx.rename(new_name, inplace=False)
+
+    # Verify that `name` and `names` match, and the original index's name is unchanged.
+    assert new_native_idx.name == new_snow_idx.name == new_name
+    assert new_native_idx.names == new_snow_idx.names == [new_name]
+    assert native_idx.name == snow_idx.name == "score"
+
+
+@pytest.mark.parametrize("new_name", [None, "grade", ("grade",), ("A", "B")])
+@sql_count_checker(query_count=1)
+def test_df_index_rename_inplace(new_name):
+    # 1 query to create the DataFrame.
+    # Create the DataFrame and the new index.
+    native_idx = native_pd.Index(["A", "C"], name="score")
+    snow_idx = pd.Index(native_idx)
+    data = [[1, 2], [3, 4]]
+    native_df = native_pd.DataFrame(data, index=native_idx)
+    snow_df = pd.DataFrame(data, index=snow_idx)
+
+    # Rename the index in place.
+    native_res = native_df.index.rename(new_name, inplace=True)
+    snow_res = snow_df.index.rename(new_name, inplace=True)
+
+    # Verify that the return value is None, and `name` and `names` match.
+    assert native_res is None
+    assert snow_res is None
+    assert native_df.index.name == snow_df.index.name == new_name
+    assert native_df.index.names == snow_df.index.names == [new_name]
+
+
+@pytest.mark.parametrize("new_name", [None, "grade", ("grade",), ("A", "B")])
+@sql_count_checker(query_count=1)
+def test_df_index_rename_copy(new_name):
+    # 1 query to create the DataFrame.
+    # Create the DataFrame and the new index.
+    native_idx = native_pd.Index(["A", "C"], name="score")
+    snow_idx = pd.Index(native_idx)
+    data = [[1, 2], [3, 4]]
+    native_df = native_pd.DataFrame(data, index=native_idx)
+    snow_df = pd.DataFrame(data, index=snow_idx)
+
+    # Rename the index and create a new index.
+    new_native_idx = native_df.index.rename(new_name, inplace=False)
+    new_snow_idx = snow_df.index.rename(new_name, inplace=False)
+
+    # Verify that `name` and `names` match, and the original index's name is unchanged.
+    assert new_native_idx.name == new_snow_idx.name == new_name
+    assert new_native_idx.names == new_snow_idx.names == [new_name]
+    assert native_df.index.name == snow_df.index.name == "score"
+
+
+@pytest.mark.parametrize("new_name", [None, "grade", ["grade"], ("grade",)])
+@sql_count_checker(query_count=0)
+def test_index_set_names_inplace(new_name):
+    native_idx = native_pd.Index(["A", "C", "A", "B"], name="score")
+    snow_idx = pd.Index(native_idx)
+
+    # Rename the index in place.
+    native_res = native_idx.set_names(new_name, inplace=True)
+    snow_res = snow_idx.set_names(new_name, inplace=True)
+
+    # Verify that the return value is None, and `name` and `names` match.
+    assert native_res is None
+    assert snow_res is None
+    assert native_idx.name == snow_idx.name == (None if new_name is None else "grade")
+    assert (
+        native_idx.names
+        == snow_idx.names
+        == ([None] if new_name is None else ["grade"])
+    )
+
+
+@pytest.mark.parametrize("new_name", [None, "grade", ["grade"], ("grade",)])
+@sql_count_checker(query_count=0)
+def test_index_set_names_copy(new_name):
+    native_idx = native_pd.Index(["A", "C", "A", "B"], name="score")
+    snow_idx = pd.Index(native_idx)
+
+    # Rename the index and create a new index.
+    new_native_idx = native_idx.set_names(new_name, inplace=False)
+    new_snow_idx = snow_idx.set_names(new_name, inplace=False)
+
+    # Verify that `name` and `names` match, and the original index's name is unchanged.
+    assert (
+        new_native_idx.name
+        == new_snow_idx.name
+        == (None if new_name is None else "grade")
+    )
+    assert (
+        new_native_idx.names
+        == new_snow_idx.names
+        == ([None] if new_name is None else ["grade"])
+    )
+    assert native_idx.name == snow_idx.name == "score"
+
+
+@pytest.mark.parametrize("new_name", [None, "grade", ["grade"], ("grade",)])
+@sql_count_checker(query_count=1)
+def test_df_index_set_names_inplace(new_name):
+    # 1 query to create the DataFrame.
+    # Create the DataFrame and the new index.
+    native_idx = native_pd.Index(["A", "C"], name="score")
+    snow_idx = pd.Index(native_idx)
+    data = [[1, 2], [3, 4]]
+    native_df = native_pd.DataFrame(data, index=native_idx)
+    snow_df = pd.DataFrame(data, index=snow_idx)
+
+    # Rename the index in place.
+    native_res = native_df.index.set_names(new_name, inplace=True)
+    snow_res = snow_df.index.set_names(new_name, inplace=True)
+
+    # Verify that the return value is None, and `name` and `names` match.
+    assert native_res is None
+    assert snow_res is None
+    assert (
+        native_df.index.name
+        == snow_df.index.name
+        == (None if new_name is None else "grade")
+    )
+    assert (
+        native_df.index.names
+        == snow_df.index.names
+        == ([None] if new_name is None else ["grade"])
+    )
+
+
+@pytest.mark.parametrize("new_name", [None, "grade", ["grade"], ("grade",)])
+@sql_count_checker(query_count=1)
+def test_df_index_set_names_copy(new_name):
+    # 1 query to create the DataFrame.
+    # Create the DataFrame and the new index.
+    native_idx = native_pd.Index(["A", "C"], name="score")
+    snow_idx = pd.Index(native_idx)
+    data = [[1, 2], [3, 4]]
+    native_df = native_pd.DataFrame(data, index=native_idx)
+    snow_df = pd.DataFrame(data, index=snow_idx)
+
+    # Rename the index and create a new index.
+    new_native_idx = native_df.index.set_names(new_name, inplace=False)
+    new_snow_idx = snow_df.index.set_names(new_name, inplace=False)
+
+    # Verify that `name` and `names` match, and the original index's name is unchanged.
+    assert (
+        new_native_idx.name
+        == new_snow_idx.name
+        == (None if new_name is None else "grade")
+    )
+    assert (
+        new_native_idx.names
+        == new_snow_idx.names
+        == ([None] if new_name is None else ["grade"])
+    )
+    assert native_df.index.name == snow_df.index.name == "score"
+
+
+@pytest.mark.parametrize("inplace", [True, False])
+@sql_count_checker(query_count=0)
+def test_index_rename_list(inplace):
+    # In native pandas, `rename` only works with hashable datatypes, however `set_names` works with
+    # non-hashable datatypes as well (these are usually list-like types).
+    # In Snowpark pandas, both `rename` and `set_names` work with non-hashable datatypes.
+    # Verify the behavior in native pandas and Snowpark pandas.
+    native_idx = native_pd.Index(["A", "C", "A", "B"], name="score")
+    snow_idx = pd.Index(native_idx)
+    new_name = ["grade"]
+
+    with pytest.raises(TypeError, match="Index.name must be a hashable type"):
+        native_idx.rename(new_name, inplace=inplace)
+
+    res = snow_idx.rename(new_name, inplace=inplace)
+    if inplace:
+        assert res is None
+        assert snow_idx.name == "grade"
+        assert snow_idx.names == ["grade"]
+    else:
+        assert res.name == "grade"
+        assert res.names == ["grade"]
+        assert snow_idx.name == "score"
+
+
+@pytest.mark.parametrize("level", [-10, 1, "abc"])
+@sql_count_checker(query_count=0)
+def test_index_set_names_invalid_level(level):
+    idx = pd.Index(["A", "C", "A", "B"], name="score")
+    err_msg = f"Level does not exist: Index has only 1 level, {level} is not a valid level number."
+    with pytest.raises(IndexError, match=err_msg):
+        idx.set_names("grade", level=level)
+
+
+@pytest.mark.parametrize("level", [0, -1])
+@sql_count_checker(query_count=0)
+def test_index_set_names_level(level):
+    # The level parameter works for Snowpark pandas even in the case of a single Index.
+    # However, native pandas does not allow you to specify the level unless the index is
+    # a MultiIndex.
+    native_idx = native_pd.Index(["A", "C", "A", "B"], name="score")
+    snow_idx = pd.Index(native_idx)
+    with pytest.raises(ValueError, match="Level must be None for non-MultiIndex"):
+        native_idx.set_names("grade", level=level)
+
+    # Verifying the results.
+    native_res = native_idx.set_names("grade")
+    snow_res = snow_idx.set_names("grade", level=level)
+    assert native_res.name == snow_res.name == "grade"
+
+
+@sql_count_checker(query_count=0)
+def test_index_non_hashable_name():
+    idx = pd.Index(["A", "C", "A", "B"], name="score")
+    with pytest.raises(TypeError, match="Index.name must be a hashable type"):
+        idx.name = ["grade"]
+
+
+@sql_count_checker(query_count=1)
+def test_index_SNOW_1021837():
+    """
+    Bug SNOW-1021837:
+    Previously, updating index.name inplace did not affect index column name after reset_index().
+    This test verifies that index column names are correctly updated.
+    """
+    native_df = native_pd.DataFrame([0])
+    snow_df = pd.DataFrame(native_df)
+
+    # Set the index names.
+    native_df.index.name = "index_name"
+    snow_df.index.name = "index_name"
+
+    # Perform reset index and check if name is correctly updated.
+    native_df_reset = native_df.reset_index()
+    snow_df_reset = snow_df.reset_index()
+    assert_frame_equal(snow_df_reset, native_df_reset)
diff --git a/tests/integ/modin/test_concat.py b/tests/integ/modin/test_concat.py
index 1049d5ea21b..f3e149a37fc 100644
--- a/tests/integ/modin/test_concat.py
+++ b/tests/integ/modin/test_concat.py
@@ -656,23 +656,13 @@ def test_concat_keys_with_none(df1, df2, axis):
     "name1, name2", [("one", "two"), ("one", None), (None, "two"), (None, None)]
 )
 def test_concat_with_keys_and_names(df1, df2, names, name1, name2, axis):
-    with SqlCounter(query_count=0 if name1 is None or axis == 1 else 3, join_count=0):
+    with SqlCounter(query_count=0 if name1 is None or axis == 1 else 2):
         df1 = df1.rename_axis(name1, axis=axis)
-    with SqlCounter(query_count=0 if name2 is None or axis == 1 else 3, join_count=0):
+    with SqlCounter(query_count=0 if name2 is None or axis == 1 else 2):
         df2 = df2.rename_axis(name2, axis=axis)

-    expected_join_count = (
-        1 if name1 is not None or name2 is not None or axis == 1 else 0
-    )
-    if axis == 0:
-        if name1 is not None:
-            expected_join_count += 1
-        if name2 is not None:
-            expected_join_count += 1
-        if name1 is not None and name2 is not None:
-            expected_join_count += 1
     # One extra query to convert index to native pandas when creating df
-    with SqlCounter(query_count=3, join_count=expected_join_count):
+    with SqlCounter(query_count=3):
         eval_snowpark_pandas_result(
             "pd",
             "native_pd",
diff --git a/tests/integ/modin/test_unimplemented.py b/tests/integ/modin/test_unimplemented.py
index 74e167abb8b..5e865c418b4 100644
--- a/tests/integ/modin/test_unimplemented.py
+++ b/tests/integ/modin/test_unimplemented.py
@@ -169,7 +169,6 @@ def test_unsupported_str_methods(func, func_name, caplog) -> None:
     lambda idx: idx.is_object(),
     lambda idx: idx.min(),
     lambda idx: idx.max(),
-    lambda idx: idx.rename(),
     lambda idx: idx.repeat(),
     lambda idx: idx.where(),
     lambda idx: idx.take(),
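A behavioral difference highlighted by `test_index_set_names_level` above: unlike native pandas, the Snowpark pandas `set_names` accepts `level=0` or `level=-1` on a single-level Index instead of raising. A minimal sketch, assuming an active Snowpark pandas session:

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

idx = pd.Index(["A", "C", "A", "B"], name="score")
renamed = idx.set_names("grade", level=0)  # native pandas raises ValueError for level on a single Index
assert renamed.name == "grade"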