From 5e360be08cd3b6b837b8d3552cb9844edf7089e9 Mon Sep 17 00:00:00 2001 From: Naresh Kumar <113932371+sfc-gh-nkumar@users.noreply.github.com> Date: Mon, 5 Aug 2024 16:34:23 -0700 Subject: [PATCH] SNOW-1573193: Remove local index from pd.Index (#2031) --- CHANGELOG.md | 2 + .../snowpark/modin/pandas/dataframe.py | 2 +- .../snowpark/modin/plugin/_internal/frame.py | 7 +- .../modin/plugin/_internal/indexing_utils.py | 15 +- .../compiler/snowflake_query_compiler.py | 14 +- .../snowpark/modin/plugin/extensions/index.py | 215 ++---------------- tests/integ/modin/frame/test_axis.py | 19 +- tests/integ/modin/frame/test_getattr.py | 3 +- tests/integ/modin/frame/test_loc.py | 14 +- tests/integ/modin/frame/test_set_index.py | 2 +- tests/integ/modin/index/test_astype.py | 14 +- tests/integ/modin/index/test_equals.py | 25 +- tests/integ/modin/index/test_index_methods.py | 28 +-- .../integ/modin/test_from_pandas_to_pandas.py | 2 +- tests/integ/modin/utils.py | 2 +- tests/unit/modin/test_type_annotations.py | 3 +- 16 files changed, 62 insertions(+), 305 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 122fc2a2339..01861f5a6d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,8 @@ - Fixed a bug in `Index.to_frame` where the result frame's column name may be wrong where name is unspecified. - Fixed a bug where some Index docstrings are ignored. +### Behavior change +- `DataFrame.columns` now returns a native pandas Index object instead of a Snowpark pandas Index object. ## 1.20.0 (2024-07-17) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 19725df4fe3..693a7aa676d 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -317,7 +317,7 @@ def _repr_html_(self): # pragma: no cover else: return result - def _get_columns(self) -> pd.Index: + def _get_columns(self) -> pandas.Index: """ Get the columns for this Snowpark pandas ``DataFrame``. 
diff --git a/src/snowflake/snowpark/modin/plugin/_internal/frame.py b/src/snowflake/snowpark/modin/plugin/_internal/frame.py index 11f457ac694..ab01baf676d 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/frame.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/frame.py @@ -24,7 +24,6 @@ last_value, max as max_, ) -from snowflake.snowpark.modin import pandas as pd from snowflake.snowpark.modin.plugin._internal.ordered_dataframe import ( OrderedDataFrame, OrderingColumn, @@ -378,7 +377,7 @@ def is_unnamed_series(self) -> bool: ) @property - def data_columns_index(self) -> "pd.Index": + def data_columns_index(self) -> native_pd.Index: """ Returns Snowpark pandas Index object for column index (df.columns). Note this object will still hold an internal pandas index (i.e., not lazy) to avoid unnecessary pulling data from Snowflake. @@ -389,15 +388,13 @@ def data_columns_index(self) -> "pd.Index": names=self.data_column_pandas_index_names, ) else: - return pd.Index( + return native_pd.Index( self.data_column_pandas_labels, name=self.data_column_pandas_index_names[0], # setting tupleize_cols=False to avoid creating a MultiIndex # otherwise, when labels are tuples (e.g., [("A", "a"), ("B", "b")]), # a MultiIndex will be created incorrectly tupleize_cols=False, - # setting is_lazy as false because we want to store the columns locally - convert_to_lazy=False, ) def index_columns_pandas_index(self, **kwargs: Any) -> native_pd.Index: diff --git a/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py index d8ef535cb01..041c5069ccd 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py @@ -776,8 +776,8 @@ def _extract_loc_set_col_info( label for label in columns if label not in frame_data_columns ] columns = [label for label in columns if label in frame_data_columns] - before = 
frame_data_columns.to_pandas().value_counts() - after = union_data_columns.to_pandas().value_counts() + before = frame_data_columns.value_counts() + after = union_data_columns.value_counts() frame_data_col_labels = frame_data_columns.tolist() for label in after.index: if label in frame_data_columns: @@ -872,7 +872,9 @@ def get_valid_col_positions_from_col_labels( ) ) ) - col_loc = pd.Index(col_loc, convert_to_lazy=False) + col_loc = col_loc.index + if isinstance(col_loc, pd.Index): + col_loc = col_loc.to_pandas() # get the position of the selected labels return [pos for pos, label in enumerate(columns) if label in col_loc] else: @@ -939,11 +941,8 @@ def get_valid_col_positions_from_col_labels( # Convert col_loc to Index with object dtype since _get_indexer_strict() converts None values in lists to # np.nan. This does not filter columns with label None and errors. Not using np.array(col_loc) as the key since # np.array(["A", 12]) turns into array(['A', '12']. - col_loc = pd.Index( - [label for label in col_loc if label in columns], - dtype=object, - # we do not convert to lazy because we are using this index as columns - convert_to_lazy=False, + col_loc = native_pd.Index( + [label for label in col_loc if label in columns], dtype=object ) # `Index._get_indexer_strict` returns position index from label index diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index ad92f0966b4..70198bdcea4 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -1322,7 +1322,7 @@ def cache_result(self) -> "SnowflakeQueryCompiler": return SnowflakeQueryCompiler(self._modin_frame.persist_to_temporary_table()) @property - def columns(self) -> "pd.Index": + def columns(self) -> native_pd.Index: """ Get pandas column labels. 
@@ -2399,17 +2399,7 @@ def _reindex_axis_1( limit = kwargs.get("limit", None) tolerance = kwargs.get("tolerance", None) fill_value = kwargs.get("fill_value", np.nan) # type: ignore[arg-type] - # Currently, our error checking relies on the column axis being eager (i.e. stored - # locally as a pandas Index, rather than pushed down to the database). This allows - # us to have parity with native pandas for things like monotonicity checks. If - # our columns are no longer eagerly stored, we would no longer be able to rely - # on pandas for these error checks, and the behaviour of reindex would change. - # This change is user-facing, so we should catch this in CI first, which we can - # by having this assert here, as a sentinel. - assert ( - not self.columns.is_lazy - ), "`reindex` with axis=1 failed on error checking." - self.columns.to_pandas().reindex(labels, method, level, limit, tolerance) + self.columns.reindex(labels, method, level, limit, tolerance) data_column_pandas_labels = [] data_column_snowflake_quoted_identifiers = [] modin_frame = self._modin_frame diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index cc325bd32ee..876fe470956 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -23,7 +23,6 @@ from __future__ import annotations -from functools import wraps from typing import Any, Callable, Hashable, Iterator, Literal import modin @@ -48,63 +47,6 @@ from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage -def is_lazy_check(func: Callable) -> Callable: - """ - Decorator method for separating function calls for lazy indexes and non-lazy (column) indexes - """ - - @wraps(func) - def check_lazy(*args: Any, **kwargs: Any) -> Any: - func_name = func.__name__ - - # If the index is lazy, call the method and return - if args[0].is_lazy: - returned_value = func(*args, **kwargs) - 
return returned_value - else: - # If the index is not lazy, get the cached native index and call the function - native_index = args[0]._index - native_func = getattr(native_index, func_name) - - # If the function is a property, we will get a non-callable, so we just return it - # Examples of this are values or dtype - if not callable(native_func): - return native_func - - # Remove the first argument in args, because it is `self` and we don't need it - args = args[1:] - args = tuple(try_convert_index_to_native(a) for a in args) - for k, v in kwargs.items(): - kwargs[k] = try_convert_index_to_native(v) - returned_value = native_func(*args, **kwargs) - - # If we return a native Index, we need to convert this to a modin index but keep it locally. - # Examples of this are `astype` and `copy` - if isinstance(returned_value, native_pd.Index): - returned_value = Index(returned_value, convert_to_lazy=False) - # Some methods also return a tuple with a pandas Index, so convert the tuple's first item to a modin Index - # Examples of this are `_get_indexer_strict` and `sort_values` - elif isinstance(returned_value, tuple) and isinstance( - returned_value[0], native_pd.Index - ): - returned_value = ( - Index(returned_value[0], convert_to_lazy=False), - returned_value[1], - ) - # For methods that return a series, convert this series to snowpark pandas - # an example is to_series - elif isinstance(returned_value, native_pd.Series): - returned_value = Series(returned_value) - - # for methods that return a dataframe, convert this dataframe to snowpark pandas - elif isinstance(returned_value, native_pd.DataFrame): - returned_value = DataFrame(returned_value) - - return returned_value - - return check_lazy - - class Index(metaclass=TelemetryMeta): def __init__( self, @@ -113,7 +55,6 @@ def __init__( copy: bool = False, name: object = None, tupleize_cols: bool = True, - convert_to_lazy: bool = True, ) -> None: """ Immutable sequence used for indexing and alignment. 
@@ -133,9 +74,6 @@ def __init__( Name to be stored in the index. tupleize_cols : bool (default: True) When True, attempt to create a MultiIndex if possible. - convert_to_lazy : bool (default: True) - When True, create a lazy index object from a local data input, otherwise, create an index object that saves a pandas index locally. - We only set convert_to_lazy as False to avoid pulling data back and forth from Snowflake, e.g., when calling df.columns, the column data should always be kept locally. Notes ----- @@ -154,35 +92,6 @@ def __init__( >>> pd.Index([1, 2, 3], dtype="uint8") Index([1, 2, 3], dtype='int64') """ - self.is_lazy = convert_to_lazy - if self.is_lazy: - self.set_query_compiler( - data=data, - dtype=dtype, - copy=copy, - name=name, - tupleize_cols=tupleize_cols, - ) - else: - self.set_local_index( - data=data, - dtype=dtype, - copy=copy, - name=name, - tupleize_cols=tupleize_cols, - ) - - def set_query_compiler( - self, - data: ArrayLike | SnowflakeQueryCompiler | None = None, - dtype: str | np.dtype | ExtensionDtype | None = None, - copy: bool = False, - name: object = None, - tupleize_cols: bool = True, - ) -> None: - """ - Helper method to find and save query compiler when index should be lazy - """ if isinstance(data, SnowflakeQueryCompiler): qc = data else: @@ -197,29 +106,6 @@ def set_query_compiler( )._query_compiler self._query_compiler = qc.drop(columns=qc.columns) - def set_local_index( - self, - data: ArrayLike | SnowflakeQueryCompiler | None = None, - dtype: str | np.dtype | ExtensionDtype | None = None, - copy: bool = False, - name: object = None, - tupleize_cols: bool = True, - ) -> None: - """ - Helper method to create and save local index when index should not be lazy - """ - if isinstance(data, SnowflakeQueryCompiler): - index = data._modin_frame.index_columns_pandas_index() - else: - index = native_pd.Index( - data=data, - dtype=dtype, - copy=copy, - name=name, - tupleize_cols=tupleize_cols, - ) - self._index = index - def 
__getattr__(self, key: str) -> Any: """ Return item identified by `key`. @@ -267,11 +153,9 @@ def to_pandas( pandas Index A native pandas Index representation of self """ - if self.is_lazy: - return self._query_compiler._modin_frame.index_columns_pandas_index( - statement_params=statement_params, **kwargs - ) - return self._index + return self._query_compiler._modin_frame.index_columns_pandas_index( + statement_params=statement_params, **kwargs + ) @property def values(self) -> ArrayLike: @@ -367,12 +251,9 @@ def is_unique(self) -> bool: >>> idx.is_unique True """ - if not self.is_lazy: - return self._index.is_unique return self._query_compiler._modin_frame.has_unique_index() @property - @is_lazy_check def has_duplicates(self) -> bool: """ Check if the Index has duplicate values. @@ -408,7 +289,6 @@ def has_duplicates(self) -> bool: """ return not self.is_unique - @is_lazy_check def unique(self, level: Hashable | None = None) -> Index: """ Return unique values in the index. @@ -452,7 +332,6 @@ def unique(self, level: Hashable | None = None) -> Index: ) @property - @is_lazy_check def dtype(self) -> DtypeObj: """ Get the dtype object of the underlying data. @@ -492,7 +371,6 @@ def shape(self) -> tuple: """ return (len(self),) - @is_lazy_check def astype(self, dtype: str | type | ExtensionDtype, copy: bool = True) -> Index: """ Create an Index with values cast to dtypes. @@ -563,19 +441,13 @@ def name(self, value: Hashable) -> None: """ Set Index name. 
""" - if self.is_lazy: - self._query_compiler = self._query_compiler.set_index_names([value]) - else: - self._index.name = value + self._query_compiler = self._query_compiler.set_index_names([value]) def _get_names(self) -> list[Hashable]: """ Get names of index """ - if self.is_lazy: - return self._query_compiler.get_index_names() - else: - return self.to_pandas().names + return self._query_compiler.get_index_names() def _set_names(self, values: list) -> None: """ @@ -590,10 +462,7 @@ def _set_names(self, values: list) -> None: ------ TypeError if each name is not hashable. """ - if self.is_lazy: - self._query_compiler = self._query_compiler.set_index_names(values) - else: - self._index.names = values + self._query_compiler = self._query_compiler.set_index_names(values) names = property(fset=_set_names, fget=_get_names) @@ -632,8 +501,7 @@ def set_names( WarningMessage.index_to_pandas_warning("set_names") if not inplace: return Index( - self.to_pandas().set_names(names, level=level, inplace=inplace), - convert_to_lazy=self.is_lazy, + self.to_pandas().set_names(names, level=level, inplace=inplace) ) return self.to_pandas().set_names(names, level=level, inplace=inplace) @@ -665,7 +533,6 @@ def size(self) -> int: return len(self) @property - @is_lazy_check def nlevels(self) -> int: """ Number of levels. 
@@ -862,7 +729,6 @@ def argmax(self) -> None: """ # TODO: SNOW-1458142 implement argmax - @is_lazy_check def copy( self, name: Hashable | None = None, @@ -897,11 +763,7 @@ def copy( False """ WarningMessage.ignored_argument(operation="copy", argument="deep", message="") - return Index( - self._query_compiler.copy(), - name=name, - convert_to_lazy=self.is_lazy, - ) + return Index(self._query_compiler.copy(), name=name) @index_not_implemented() def delete(self) -> None: @@ -925,7 +787,6 @@ def delete(self) -> None: """ # TODO: SNOW-1458146 implement delete - @is_lazy_check def drop( self, labels: Any, @@ -958,10 +819,7 @@ def drop( """ # TODO: SNOW-1458146 implement drop WarningMessage.index_to_pandas_warning("drop") - return Index( - self.to_pandas().drop(labels=labels, errors=errors), - convert_to_lazy=self.is_lazy, - ) + return Index(self.to_pandas().drop(labels=labels, errors=errors)) @index_not_implemented() def drop_duplicates(self) -> None: @@ -987,7 +845,6 @@ def drop_duplicates(self) -> None: """ # TODO: SNOW-1458147 implement drop_duplicates - @is_lazy_check def duplicated(self, keep: Literal["first", "last", False] = "first") -> np.ndarray: """ Indicate duplicate index values. @@ -1116,20 +973,9 @@ def equals(self, other: Any) -> bool: if isinstance(other, native_pd.Index): # Same as DataFrame/Series equals. Convert native Index to Snowpark pandas # Index for comparison. - other = Index(other, convert_to_lazy=self.is_lazy) - - left = self - right = other - # If both are cached compare underlying cached value locally. - if not left.is_lazy and not right.is_lazy: - return left._index.equals(right._index) + other = Index(other) - # Ensure both sides are lazy before calling index_equals on query_compiler. 
- if not left.is_lazy: - left = Index(left._index, convert_to_lazy=True) - if not right.is_lazy: - right = Index(right._index, convert_to_lazy=True) - return left._query_compiler.index_equals(right._query_compiler) + return self._query_compiler.index_equals(other._query_compiler) @index_not_implemented() def identical(self) -> None: @@ -1445,7 +1291,6 @@ def rename(self) -> None: """ # TODO: SNOW-1458122 implement rename - @is_lazy_check def nunique(self, dropna: bool = True) -> int: """ Return number of unique elements in the object. @@ -1482,7 +1327,6 @@ def nunique(self, dropna: bool = True) -> int: """ return self._query_compiler.nunique_index(dropna=dropna) - @is_lazy_check def value_counts( self, normalize: bool = False, @@ -1564,7 +1408,6 @@ def value_counts( name="proportion" if normalize else "count", ) - @is_lazy_check def item(self) -> Hashable: """ Return the first element of the underlying data as a Python scalar. @@ -1591,7 +1434,6 @@ def item(self) -> Hashable: # otherwise raise the same value error as pandas raise ValueError("can only convert an array of size 1 to a Python scalar") - @is_lazy_check def to_series( self, index: Index | None = None, name: Hashable | None = None ) -> Series: @@ -1635,7 +1477,6 @@ def to_series( ser.name = name return ser - @is_lazy_check def to_frame( self, index: bool = True, name: Hashable | None = lib.no_default ) -> modin.pandas.DataFrame: @@ -1862,7 +1703,6 @@ def tolist(self) -> list: to_list = tolist - @is_lazy_check def sort_values( self, return_indexer: bool = False, @@ -1939,7 +1779,7 @@ def sort_values( key=key, include_indexer=return_indexer, ) - index = Index(res, convert_to_lazy=self.is_lazy) + index = Index(res) if return_indexer: # When `return_indexer` is True, `res` is a query compiler with one index column # and one data column. 
@@ -1988,7 +1828,6 @@ def join(self) -> None: """ # TODO: SNOW-1458150 implement join - @is_lazy_check def intersection(self, other: Any, sort: bool = False) -> Index: """ Form the intersection of two Index objects. @@ -2023,11 +1862,9 @@ def intersection(self, other: Any, sort: bool = False) -> Index: return Index( self.to_pandas().intersection( other=try_convert_index_to_native(other), sort=sort - ), - convert_to_lazy=self.is_lazy, + ) ) - @is_lazy_check def union(self, other: Any, sort: bool = False) -> Index: """ Form the union of two Index objects. @@ -2076,11 +1913,9 @@ def union(self, other: Any, sort: bool = False) -> Index: # TODO: SNOW-1468240 implement union w/ sort WarningMessage.index_to_pandas_warning("union") return Index( - self.to_pandas().union(other=try_convert_index_to_native(other), sort=sort), - convert_to_lazy=self.is_lazy, + self.to_pandas().union(other=try_convert_index_to_native(other), sort=sort) ) - @is_lazy_check def difference(self, other: Any, sort: Any = None) -> Index: """ Return a new Index with elements of index not in `other`. @@ -2117,11 +1952,9 @@ def difference(self, other: Any, sort: Any = None) -> Index: # TODO: SNOW-1458152 implement difference WarningMessage.index_to_pandas_warning("difference") return Index( - self.to_pandas().difference(try_convert_index_to_native(other), sort=sort), - convert_to_lazy=self.is_lazy, + self.to_pandas().difference(try_convert_index_to_native(other), sort=sort) ) - @is_lazy_check def get_indexer_for(self, target: Any) -> Any: """ Guaranteed return of an indexer even when non-unique. @@ -2144,16 +1977,14 @@ def get_indexer_for(self, target: Any) -> Any: WarningMessage.index_to_pandas_warning("get_indexer_for") return self.to_pandas().get_indexer_for(target=target) - @is_lazy_check def _get_indexer_strict(self, key: Any, axis_name: str) -> tuple[Index, np.ndarray]: """ Analogue to pandas.Index.get_indexer that raises if any elements are missing. 
""" WarningMessage.index_to_pandas_warning("_get_indexer_strict") tup = self.to_pandas()._get_indexer_strict(key=key, axis_name=axis_name) - return Index(tup[0], convert_to_lazy=self.is_lazy), tup[1] + return Index(tup[0]), tup[1] - @is_lazy_check def get_level_values(self, level: int | str) -> Index: """ Return an Index of values for requested level. @@ -2187,9 +2018,7 @@ def get_level_values(self, level: int | str) -> Index: Index(['a', 'b', 'c'], dtype='object') """ WarningMessage.index_to_pandas_warning("get_level_values") - return Index( - self.to_pandas().get_level_values(level=level), convert_to_lazy=self.is_lazy - ) + return Index(self.to_pandas().get_level_values(level=level)) @index_not_implemented() def isin(self) -> None: @@ -2232,7 +2061,6 @@ def isin(self) -> None: """ # TODO: SNOW-1458153 implement isin - @is_lazy_check def slice_indexer( self, start: Hashable | None = None, @@ -2278,14 +2106,12 @@ def slice_indexer( return self.to_pandas().slice_indexer(start=start, end=end, step=step) @property - @is_lazy_check def array(self) -> ExtensionArray: """ return the array of values """ return self.to_pandas().array - @is_lazy_check def _summary(self, name: Any = None) -> str: """ Return a summarized representation. @@ -2303,14 +2129,12 @@ def _summary(self, name: Any = None) -> str: WarningMessage.index_to_pandas_warning("_summary") return self.to_pandas()._summary(name=name) - @is_lazy_check def __array__(self, dtype: Any = None) -> np.ndarray: """ The array interface, return the values. """ return self.to_pandas().__array__(dtype=dtype) - @is_lazy_check def __repr__(self) -> str: """ Return a string representation for this object. @@ -2318,7 +2142,6 @@ def __repr__(self) -> str: WarningMessage.index_to_pandas_warning("__repr__") return self.to_pandas().__repr__() - @is_lazy_check def __iter__(self) -> Iterator: """ Return an iterator of the values. 
@@ -2344,7 +2167,6 @@ def __iter__(self) -> Iterator: WarningMessage.index_to_pandas_warning("__iter__") return self.to_pandas().__iter__() - @is_lazy_check def __contains__(self, key: Any) -> bool: """ Return a boolean indicating whether the provided key is in the index. @@ -2378,14 +2200,12 @@ def __contains__(self, key: Any) -> bool: WarningMessage.index_to_pandas_warning("__contains__") return self.to_pandas().__contains__(key=key) - @is_lazy_check def __len__(self) -> int: """ Return the length of the Index as an int. """ return self._query_compiler.get_axis_len(0) - @is_lazy_check def __getitem__(self, key: Any) -> np.ndarray | None | Index: """ Reuse series iloc to implement getitem for index. @@ -2401,7 +2221,6 @@ def __getitem__(self, key: Any) -> np.ndarray | None | Index: "boolean arrays are valid indices" ) from ie - @is_lazy_check def __setitem__(self, key: Any, value: Any) -> None: """ Override numpy.ndarray's __setitem__ method to work as desired. diff --git a/tests/integ/modin/frame/test_axis.py b/tests/integ/modin/frame/test_axis.py index 486aee14f95..b253906ba53 100644 --- a/tests/integ/modin/frame/test_axis.py +++ b/tests/integ/modin/frame/test_axis.py @@ -155,19 +155,10 @@ def set_columns_func(df, labels): ) @sql_count_checker(query_count=0) def test_set_columns(columns): - if isinstance(columns, native_pd.Index) and not isinstance( - columns, native_pd.MultiIndex - ): - snow_columns = pd.Index(columns, convert_to_lazy=False) - else: - snow_columns = columns - eval_snowpark_pandas_result( pd.DataFrame(test_dfs[0].copy()), test_dfs[0].copy(), - lambda df: set_columns_func( - df, labels=snow_columns if isinstance(df, pd.DataFrame) else columns - ), + lambda df: set_columns_func(df, columns), comparator=assert_index_equal, ) @@ -227,16 +218,10 @@ def test_set_columns_valid_names(col_name): ) @sql_count_checker(query_count=0) def test_set_columns_negative(columns, error_type, error_msg): - if isinstance(columns, native_pd.Index): - snow_columns = 
pd.Index(columns, convert_to_lazy=False) - else: - snow_columns = columns eval_snowpark_pandas_result( pd.DataFrame(test_dfs[0]), test_dfs[0], - lambda df: set_columns_func( - df, labels=snow_columns if isinstance(df, pd.DataFrame) else columns - ), + lambda df: set_columns_func(df, labels=columns), comparator=assert_index_equal, expect_exception=True, expect_exception_type=error_type, diff --git a/tests/integ/modin/frame/test_getattr.py b/tests/integ/modin/frame/test_getattr.py index db74dfa107e..a2d6f32577c 100644 --- a/tests/integ/modin/frame/test_getattr.py +++ b/tests/integ/modin/frame/test_getattr.py @@ -5,6 +5,7 @@ import inspect import modin.pandas as pd +import pandas import pandas as native_pd import pytest @@ -33,7 +34,7 @@ def test_getattr(name, expected_query_count): assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( snow_res, native_res ) - elif isinstance(snow_res, pd.Index): + elif isinstance(snow_res, (pd.Index, pandas.Index)): assert_index_equal(snow_res, native_res, exact=False) else: # e.g., mean will return bound method similar to pandas diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index 3c3ac6cdee8..381a3034c12 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -144,10 +144,6 @@ def test_df_loc_get_tuple_key( snow_row = pd.Index(row) else: snow_row = row - if isinstance(col, native_pd.Index): - snow_col = pd.Index(col, convert_to_lazy=False) - else: - snow_col = col query_count = 1 if is_scalar(row) or isinstance(row, tuple) or isinstance(row, native_pd.Index): @@ -159,7 +155,7 @@ def test_df_loc_get_tuple_key( eval_snowpark_pandas_result( str_index_snowpark_pandas_df, str_index_native_df, - lambda df: df.loc[snow_row, snow_col] + lambda df: df.loc[snow_row, col] if isinstance(df, pd.DataFrame) else df.loc[row, col], ) @@ -236,10 +232,7 @@ def test_df_loc_get_col_boolean_indexer( str_index_native_df, lambda df: df.loc[ :, - pd.Series( - key, - 
index=pd.Index(str_index_native_df.columns, convert_to_lazy=False), - ) + pd.Series(key, index=str_index_native_df.columns) if isinstance(df, pd.DataFrame) else native_pd.Series(key, index=str_index_native_df.columns), ], @@ -2917,9 +2910,6 @@ def test_df_loc_set_with_column_wise_list_like_item( snow_df = pd.DataFrame(native_df) native_item = item - if isinstance(col_key, native_pd.Index): - col_key = pd.Index(col_key, convert_to_lazy=False) - def loc_set_helper(df): if isinstance(df, pd.DataFrame): df.loc[row_key, col_key] = item_to_type(item) diff --git a/tests/integ/modin/frame/test_set_index.py b/tests/integ/modin/frame/test_set_index.py index 6bd6d754278..15566d630f1 100644 --- a/tests/integ/modin/frame/test_set_index.py +++ b/tests/integ/modin/frame/test_set_index.py @@ -100,7 +100,7 @@ def test_set_index_negative(snow_df, native_df): @sql_count_checker(query_count=1) def test_set_index_dup_column_name(): snow_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) - snow_df.columns = pd.Index(["A", "A", "B"], convert_to_lazy=False) + snow_df.columns = native_pd.Index(["A", "A", "B"]) eval_snowpark_pandas_result( snow_df, snow_df.to_pandas(), diff --git a/tests/integ/modin/index/test_astype.py b/tests/integ/modin/index/test_astype.py index cf84723fe2b..97013c5d211 100644 --- a/tests/integ/modin/index/test_astype.py +++ b/tests/integ/modin/index/test_astype.py @@ -42,10 +42,9 @@ (native_pd.Index([1.11, 2.1111, 3.0002, 4.111], dtype=object), np.float64), ], ) -@pytest.mark.parametrize("is_lazy", [True, False]) -def test_index_astype(index, type, is_lazy): - snow_index = pd.Index(index, convert_to_lazy=is_lazy) - with SqlCounter(query_count=1 if is_lazy else 0): +def test_index_astype(index, type): + snow_index = pd.Index(index) + with SqlCounter(query_count=1): assert_index_equal(snow_index.astype(type), index.astype(type)) @@ -100,11 +99,10 @@ def test_index_df_columns_astype(index, type): @pytest.mark.parametrize("from_type", [str, np.int64, np.float64, object, bool]) 
@pytest.mark.parametrize("to_type", [str, np.int64, np.float64, object, bool]) -@pytest.mark.parametrize("is_lazy", [True, False]) -def test_index_astype_empty_index(from_type, to_type, is_lazy): +def test_index_astype_empty_index(from_type, to_type): native_index = native_pd.Index([], dtype=from_type) - snow_index = pd.Index(native_index, convert_to_lazy=is_lazy) - with SqlCounter(query_count=1 if is_lazy else 0): + snow_index = pd.Index(native_index) + with SqlCounter(query_count=1): assert_index_equal(snow_index.astype(to_type), native_index.astype(to_type)) diff --git a/tests/integ/modin/index/test_equals.py b/tests/integ/modin/index/test_equals.py index 96a9caa8720..ccc2165ae9e 100644 --- a/tests/integ/modin/index/test_equals.py +++ b/tests/integ/modin/index/test_equals.py @@ -23,22 +23,15 @@ ([1, 2, 3, 4], [1, 2, 3], False), # extra value in left ], ) -@pytest.mark.parametrize("is_left_lazy", [True, False]) -@pytest.mark.parametrize("is_right_lazy", [True, False]) -def test_index_equals(lhs, rhs, expected, is_left_lazy, is_right_lazy): - query_count = int(is_left_lazy) + int(is_right_lazy) - with SqlCounter(query_count=query_count, join_count=query_count): +def test_index_equals(lhs, rhs, expected): + with SqlCounter(query_count=2, join_count=2): native_result = native_pd.Index(lhs).equals(native_pd.Index(rhs)) assert native_result == expected - snow_result = pd.Index(lhs, convert_to_lazy=is_right_lazy).equals( - pd.Index(rhs, convert_to_lazy=is_right_lazy) - ) + snow_result = pd.Index(lhs).equals(pd.Index(rhs)) assert snow_result == expected - mixed_result = pd.Index(lhs, convert_to_lazy=is_left_lazy).equals( - native_pd.Index(rhs) - ) + mixed_result = pd.Index(lhs).equals(native_pd.Index(rhs)) assert mixed_result == expected @@ -62,16 +55,6 @@ def test_index_equals_other_types(): assert index.equals(series) is False -@sql_count_checker(query_count=2, join_count=2) -def test_index_lazy_with_non_lazy(): - # Lazy index - index1 = pd.Index([1, 2]) - # Non lazy 
index - index2 = pd.Index([1, 2], convert_to_lazy=False) - assert index1.equals(index2) - assert index2.equals(index1) - - @sql_count_checker(query_count=0) def test_index_columns_self_compare(): # Bug SNOW-1478684 diff --git a/tests/integ/modin/index/test_index_methods.py b/tests/integ/modin/index/test_index_methods.py index 417089f13df..37aa942680f 100644 --- a/tests/integ/modin/index/test_index_methods.py +++ b/tests/integ/modin/index/test_index_methods.py @@ -58,7 +58,7 @@ def test_df_index_equals(native_df): snow_df = pd.DataFrame(native_df) assert native_df.columns.equals(native_df.columns) assert snow_df.columns.equals(snow_df.columns) - assert native_df.columns.equals(snow_df.columns.to_pandas()) + assert native_df.columns.equals(snow_df.columns) assert snow_df.columns.equals(native_df.columns) assert native_df.index.equals(native_df.index) @@ -270,7 +270,7 @@ def test_df_index_columns_to_series(native_df, generate_extra_index, name): row_index = None col_index = None - with SqlCounter(query_count=2, join_count=1 if generate_extra_index else 0): + with SqlCounter(query_count=1, join_count=1 if generate_extra_index else 0): assert_series_equal( snow_df.index.to_series(index=row_index, name=name), native_df.index.to_series(index=row_index, name=name), @@ -298,11 +298,11 @@ def test_index_to_frame(native_index, name, index): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("name", [None, "name", True, 1, lib.no_default]) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize("native_df", TEST_DFS) -def test_df_index_columns_to_frame(native_df, index, name): +def test_df_index_to_frame(native_df, index, name): snow_df = pd.DataFrame(native_df) assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( snow_df.index.to_frame(index=index, name=name), @@ -310,12 +310,6 @@ def test_df_index_columns_to_frame(native_df, index, name): check_index_type=False, check_column_type=False, ) - 
assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( - snow_df.columns.to_frame(index=index, name=name), - native_df.columns.to_frame(index=index, name=name), - check_index_type=False, - check_column_type=False, - ) @sql_count_checker(query_count=0) @@ -336,16 +330,14 @@ def test_df_index_columns_dtype(native_df): @pytest.mark.parametrize("index", NATIVE_INDEX_UNIQUE_TEST_DATA) -@pytest.mark.parametrize("is_lazy", [True, False]) -def test_is_unique(index, is_lazy): - with SqlCounter(query_count=int(is_lazy)): - snow_index = pd.Index(index, convert_to_lazy=is_lazy) +def test_is_unique(index): + with SqlCounter(query_count=1): + snow_index = pd.Index(index) assert index.is_unique == snow_index.is_unique @pytest.mark.parametrize("index", NATIVE_INDEX_UNIQUE_TEST_DATA) -@pytest.mark.parametrize("is_lazy", [True, False]) -def test_has_duplicates(index, is_lazy): - with SqlCounter(query_count=int(is_lazy)): - snow_index = pd.Index(index, convert_to_lazy=is_lazy) +def test_has_duplicates(index): + with SqlCounter(query_count=1): + snow_index = pd.Index(index) assert index.has_duplicates == snow_index.has_duplicates diff --git a/tests/integ/modin/test_from_pandas_to_pandas.py b/tests/integ/modin/test_from_pandas_to_pandas.py index 74f860a5693..6091da8dd3a 100644 --- a/tests/integ/modin/test_from_pandas_to_pandas.py +++ b/tests/integ/modin/test_from_pandas_to_pandas.py @@ -252,7 +252,7 @@ def test_column_index_names(pandas_label): def test_to_pandas_column_index_names(name): df = pd.DataFrame( data=[[1] * 2, [2] * 2], - columns=pd.Index([1, 2], name=name, convert_to_lazy=False), + columns=native_pd.Index([1, 2], name=name), ) assert df.columns.names == [name] pdf = df.to_pandas() diff --git a/tests/integ/modin/utils.py b/tests/integ/modin/utils.py index 3dcab58bc7e..6ec630024a3 100644 --- a/tests/integ/modin/utils.py +++ b/tests/integ/modin/utils.py @@ -194,7 +194,7 @@ def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, native_pd.DataFrame] and 
isinstance(native_kw_args["columns"], native_pd.Index) and not isinstance(native_kw_args["columns"], pd.MultiIndex) ): - kwargs["columns"] = pd.Index(native_kw_args["columns"], convert_to_lazy=False) + kwargs["columns"] = native_pd.Index(native_kw_args["columns"]) return (pd.DataFrame(*args, **kwargs), native_pd.DataFrame(*args, **native_kw_args)) diff --git a/tests/unit/modin/test_type_annotations.py b/tests/unit/modin/test_type_annotations.py index 63a89d1dfbb..975e784f05f 100644 --- a/tests/unit/modin/test_type_annotations.py +++ b/tests/unit/modin/test_type_annotations.py @@ -5,6 +5,7 @@ from typing import get_type_hints import modin.pandas as pd +import pandas as native_pd import pytest import snowflake.snowpark.modin.plugin # noqa: F401 @@ -14,7 +15,7 @@ "method,type_hints", [ (pd.Series.empty.fget, {"return": bool}), - (pd.DataFrame.columns.fget, {"return": pd.Index}), + (pd.DataFrame.columns.fget, {"return": native_pd.Index}), ], ) def test_properties_snow_1374293(method, type_hints):