diff --git a/CHANGELOG.md b/CHANGELOG.md index 215d8d99804..3f345fafc81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ - Added support for `DataFrame.corr`. - Added support for `limit` parameter when `method` parameter is used in `fillna`. - Added support for `DataFrame.equals` and `Series.equals`. +- Added support for `DataFrame.reindex` and `Series.reindex`. #### Bug Fixes - Fixed an issue when using np.where and df.where when the scalar 'other' is the literal 0. diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst index 6bb3728eb64..97f3303ffe2 100644 --- a/docs/source/modin/supported/dataframe_supported.rst +++ b/docs/source/modin/supported/dataframe_supported.rst @@ -330,7 +330,8 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``rdiv`` | P | ``level`` | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``reindex`` | N | | | +| ``reindex`` | P | | ``N`` if axis is MultiIndex or method is | +| | | | ``nearest``. 
| +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``reindex_like`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/docs/source/modin/supported/series_supported.rst b/docs/source/modin/supported/series_supported.rst index 41bbf084e2d..bac00cb26f6 100644 --- a/docs/source/modin/supported/series_supported.rst +++ b/docs/source/modin/supported/series_supported.rst @@ -322,7 +322,8 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``rdivmod`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``reindex`` | N | | | +| ``reindex`` | P | | ``N`` if the series has MultiIndex, or method | +| | | | is ``nearest``. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``reindex_like`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/pandas/base.py b/src/snowflake/snowpark/modin/pandas/base.py index 3beb95948ee..a06ea2d8ede 100644 --- a/src/snowflake/snowpark/modin/pandas/base.py +++ b/src/snowflake/snowpark/modin/pandas/base.py @@ -2584,7 +2584,6 @@ def _ensure_index(self, index_like, axis=0): # noqa: PR01, RT01, D200 pass return ensure_index(index_like) - @base_not_implemented() def reindex( self, index=None, @@ -2596,7 +2595,10 @@ def reindex( Conform `BasePandasDataset` to new index with optional filling logic. 
""" # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - + if kwargs.get("limit", None) is not None and kwargs.get("method", None) is None: + raise ValueError( + "limit argument only valid if doing pad, backfill or nearest reindexing" + ) new_query_compiler = None if index is not None: if not isinstance(index, pandas.Index) or not index.equals(self.index): diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 5265b8a910c..a675f3673eb 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -2073,7 +2073,6 @@ def rename( new_query_compiler=new_qc, inplace=inplace ) - @dataframe_not_implemented() def reindex( self, labels=None, diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 4549539eb0d..5ff5f98e711 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -1672,10 +1672,12 @@ def ravel(self, order="C"): # noqa: PR01, RT01, D200 def reindex(self, *args, **kwargs): if args: if len(args) > 1: - raise TypeError("Only one positional argument ('index') is allowed") + raise TypeError( + "Series.reindex() takes from 1 to 2 positional arguments but 3 were given" + ) if "index" in kwargs: raise TypeError( - "'index' passed as both positional and keyword argument" + "Series.reindex() got multiple values for argument 'index'" ) kwargs.update({"index": args[0]}) index = kwargs.pop("index", None) @@ -1685,10 +1687,11 @@ def reindex(self, *args, **kwargs): limit = kwargs.pop("limit", None) tolerance = kwargs.pop("tolerance", None) fill_value = kwargs.pop("fill_value", None) + kwargs.pop("axis", None) if kwargs: raise TypeError( - "reindex() got an unexpected keyword " - + f'argument "{list(kwargs.keys())[0]}"' + "Series.reindex() got an unexpected keyword " + + f"argument '{list(kwargs.keys())[0]}'" ) return 
super().reindex( index=index, diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index ff0aa85cdc3..e234d9d2ff1 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2176,6 +2176,306 @@ def any( False, "any", axis=axis, _bool_only=bool_only, skipna=skipna ) + def reindex( + self, + axis: int, + labels: Union[pandas.Index, list[Any]], + **kwargs: dict[str, Any], + ) -> "SnowflakeQueryCompiler": + """ + Align QueryCompiler data with a new index along specified axis. + + Parameters + ---------- + axis : {0, 1} + Axis to align labels along. 0 is for index, 1 is for columns. + labels : list-like + Index-labels to align with. + method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"} + Method to use for filling holes in reindexed frame. + fill_value : scalar + Value to use for missing values in the resulted frame. + limit : int + tolerance : int + **kwargs : dict + Serves the compatibility purpose. Does not affect the result. + + Returns + ------- + SnowflakeQueryCompiler + QueryCompiler with aligned axis. + """ + if self.is_multiindex(axis=axis): + raise NotImplementedError( + "Snowpark pandas doesn't support `reindex` with MultiIndex" + ) + if axis == 0: + return self._reindex_axis_0(labels=labels, **kwargs) + else: + return self._reindex_axis_1(labels=labels, **kwargs) + + def _reindex_axis_0( + self, + labels: Union[pandas.Index, list[Any]], + **kwargs: dict[str, Any], + ) -> "SnowflakeQueryCompiler": + """ + Align QueryCompiler data with a new index. + + Parameters + ---------- + labels : list-like + Index-labels to align with. + method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"} + Method to use for filling holes in reindexed frame. + fill_value : scalar + Value to use for missing values in the resulted frame. 
+ limit : int + tolerance : int + **kwargs : dict + Serves the compatibility purpose. Does not affect the result. + + Returns + ------- + SnowflakeQueryCompiler + QueryCompiler with aligned axis. + """ + new_index_qc = pd.Series(labels)._query_compiler + new_index_modin_frame = new_index_qc._modin_frame + modin_frame = self._modin_frame + method = kwargs.get("method", None) + fill_value = kwargs.get("fill_value", np.nan) # type: ignore[arg-type] + _filter_column_snowflake_quoted_id = None + if fill_value is not np.nan or method: + # If we are filling values, reindex ignores NaN values that + # were previously present in the DataFrame before reindexing. + # In order to differentiate between pre-existing NaN values, + # and new NaN values caused by new index values that are not + # present, we can attach a boolean column of all `True`'s to + # self's modin_frame. After the left join with the new index + # rows that were present in self will have a True value, while + # rows that were not present in self will have a NA value. We can + # filter by which rows have an NA value for the dummy column to determine + # between pre-existing NaN's, and NaN's that were introduced because of new + # values in the index that are not present in the old index. If a row + # has a True value for the dummy column, any NaN's in it should be ignored + # as it is a pre-existing NaN value that we **should not** fill. 
+ modin_frame = modin_frame.append_column( + "dummy_reindex_column_for_fill", pandas_lit(True) + ) + _filter_column_snowflake_quoted_id = ( + modin_frame.data_column_snowflake_quoted_identifiers[-1] + ) + result_frame, result_frame_column_mapper = join_utils.join( + new_index_modin_frame, + modin_frame, + how="left", + left_on=new_index_modin_frame.data_column_snowflake_quoted_identifiers, + right_on=modin_frame.index_column_snowflake_quoted_identifiers, + ) + data_column_pandas_labels = modin_frame.data_column_pandas_labels + data_column_snowflake_quoted_identifiers = ( + result_frame_column_mapper.map_right_quoted_identifiers( + modin_frame.data_column_snowflake_quoted_identifiers + ) + ) + new_modin_frame = InternalFrame.create( + ordered_dataframe=result_frame.ordered_dataframe, + data_column_pandas_labels=data_column_pandas_labels, + data_column_snowflake_quoted_identifiers=data_column_snowflake_quoted_identifiers, + data_column_pandas_index_names=modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_left_quoted_identifiers( + new_index_modin_frame.data_column_snowflake_quoted_identifiers + ), + ) + new_qc = SnowflakeQueryCompiler(new_modin_frame) + if method or fill_value is not np.nan: + limit = kwargs.get("limit", None) + new_filter_column_snowflake_quoted_id = ( + result_frame_column_mapper.map_right_quoted_identifiers( + [_filter_column_snowflake_quoted_id] + )[0] + ) + ( + new_modin_frame, + mapper, + ) = new_modin_frame.update_snowflake_quoted_identifiers_with_expressions( + { + new_filter_column_snowflake_quoted_id: coalesce( + col(new_filter_column_snowflake_quoted_id), + pandas_lit(False), + ) + } + ) + new_filter_column_snowflake_quoted_id = mapper[ + new_filter_column_snowflake_quoted_id + ] + if method not in ["nearest", None]: + new_qc = SnowflakeQueryCompiler( + new_modin_frame.ensure_row_position_column() + ) + 
ordering_column = ( + new_qc._modin_frame.row_position_snowflake_quoted_identifier + ) + new_qc = new_qc.sort_rows_by_column_values( + columns=new_modin_frame.index_column_pandas_labels, + ascending=[True], + kind="stable", + na_position="last", + ignore_index=False, + ) + new_qc = new_qc._fillna_with_masking( + self_is_series=False, + method=method, + limit=limit, # type: ignore[arg-type] + axis=0, + row_mask_snowflake_quoted_identifier=new_filter_column_snowflake_quoted_id, + ) + new_ordered_frame = new_qc._modin_frame.ordered_dataframe.sort( + OrderingColumn(snowflake_quoted_identifier=ordering_column) + ) + new_ordered_frame.row_position_snowflake_quoted_identifier = ( + ordering_column + ) + new_qc = SnowflakeQueryCompiler( + InternalFrame.create( + ordered_dataframe=new_ordered_frame, + data_column_pandas_labels=new_qc._modin_frame.data_column_pandas_labels[ + :-1 + ], + data_column_snowflake_quoted_identifiers=new_qc._modin_frame.data_column_snowflake_quoted_identifiers[ + :-1 + ], + data_column_pandas_index_names=new_qc._modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=new_qc._modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=new_qc._modin_frame.index_column_snowflake_quoted_identifiers, + ) + ) + if fill_value is not np.nan: + new_qc = new_qc._fillna_with_masking( + self_is_series=False, + value=fill_value, + axis=0, + row_mask_snowflake_quoted_identifier=new_filter_column_snowflake_quoted_id, + ) + if method in ["nearest", None]: + # In this case, we haven't removed the dummy column that tells us which NA values + # should not be replaced. 
+ new_qc = SnowflakeQueryCompiler( + InternalFrame.create( + ordered_dataframe=new_qc._modin_frame.ordered_dataframe, + data_column_pandas_labels=new_qc._modin_frame.data_column_pandas_labels[ + :-1 + ], + data_column_snowflake_quoted_identifiers=new_qc._modin_frame.data_column_snowflake_quoted_identifiers[ + :-1 + ], + data_column_pandas_index_names=new_qc._modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=new_qc._modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=new_qc._modin_frame.index_column_snowflake_quoted_identifiers, + ) + ) + + return new_qc + + def _reindex_axis_1( + self, + labels: Union[pandas.Index, list[Any]], + **kwargs: dict[str, Any], + ) -> "SnowflakeQueryCompiler": + """ + Align QueryCompiler data with a new column. + + Parameters + ---------- + labels : list-like + Index-labels to align with. + method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"} + Method to use for filling holes in reindexed frame. + fill_value : scalar + Value to use for missing values in the resulted frame. + limit : int + tolerance : int + **kwargs : dict + Serves the compatibility purpose. Does not affect the result. + + Returns + ------- + SnowflakeQueryCompiler + QueryCompiler with aligned axis. + """ + method = kwargs.get("method", None) + level = kwargs.get("level", None) + limit = kwargs.get("limit", None) + tolerance = kwargs.get("tolerance", None) + fill_value = kwargs.get("fill_value", np.nan) # type: ignore[arg-type] + # Currently, our error checking relies on the column axis being eager (i.e. stored + # locally as a pandas Index, rather than pushed down to the database). This allows + # us to have parity with native pandas for things like monotonicity checks. If + # our columns are no longer eagerly stored, we would no longer be able to rely + # on pandas for these error checks, and the behaviour of reindex would change. 
+ # This change is user-facing, so we should catch this in CI first, which we can + # by having this assert here, as a sentinel. + assert ( + not self.columns.is_lazy + ), "`reindex` with axis=1 failed on error checking." + self.columns.to_pandas().reindex(labels, method, level, limit, tolerance) + data_column_pandas_labels = [] + data_column_snowflake_quoted_identifiers = [] + modin_frame = self._modin_frame + for label in labels: + data_column_pandas_labels += [label] + if label in self._modin_frame.data_column_pandas_labels: + snowflake_quoted_ids = list( + modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( + [label], include_index=False + )[0] + ) + data_column_snowflake_quoted_identifiers += snowflake_quoted_ids + if len(snowflake_quoted_ids) > 1: + data_column_pandas_labels += [label] * ( + len(snowflake_quoted_ids) - 1 + ) + else: + modin_frame = modin_frame.append_column(label, pandas_lit(np.nan)) + data_column_snowflake_quoted_identifiers += [ + modin_frame.data_column_snowflake_quoted_identifiers[-1] + ] + new_modin_frame = InternalFrame.create( + ordered_dataframe=modin_frame.ordered_dataframe, + data_column_pandas_labels=data_column_pandas_labels, + data_column_snowflake_quoted_identifiers=data_column_snowflake_quoted_identifiers, + data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=self._modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=self._modin_frame.index_column_snowflake_quoted_identifiers, + ) + new_qc = SnowflakeQueryCompiler(new_modin_frame) + ordered_columns = sorted(data_column_pandas_labels) + columns_to_ignore = [ + c in self._modin_frame.data_column_pandas_labels for c in ordered_columns + ] + if method not in ["nearest", None]: + new_qc = new_qc.take_2d_labels( + index=slice(None), columns=ordered_columns + )._fillna_with_masking( + method=method, limit=limit, columns_mask=columns_to_ignore, self_is_series=False # type: 
ignore[arg-type] + ) + if fill_value is not np.nan: + new_qc = new_qc.take_2d_labels( + index=slice(None), columns=ordered_columns + )._fillna_with_masking( + value=fill_value, + columns_mask=columns_to_ignore, + self_is_series=False, + ) + if method not in ["nearest", None] or fill_value is not np.nan: + # We only need to reorder the columns if we sorted them above for filling. + new_qc = new_qc.take_2d_labels( + index=slice(None), columns=data_column_pandas_labels + ) + return new_qc + def _parse_names_arguments_from_reset_index( self, names: IndexLabel, @@ -8873,6 +9173,7 @@ def _make_fill_expression_for_column_wise_fillna( snowflake_quoted_identifier: str, method: FillNAMethod, limit: Optional[int] = None, + _columns_to_ignore: Optional[list[bool]] = None, ) -> SnowparkColumn: """ Helper function to get the Snowpark Column expression corresponding to snowflake_quoted_id when doing a column wise fillna. @@ -8909,21 +9210,67 @@ def _make_fill_expression_for_column_wise_fillna( start_pos = 0 if limit is not None: start_pos = max(col_pos - limit, start_pos) + columns_to_include = ( + self._modin_frame.data_column_snowflake_quoted_identifiers[ + start_pos:col_pos + ][::-1] + ) + if _columns_to_ignore: + # When _colums_to_ignore is set, we are using this to perform a column-wise fill for reindex. + # In that case, we will do two things: + # 1. We must filter so that the only columns that appear in the coalesce are columns that + # were previously a part of the dataframe (filter using the booleans in _columns_to_ignore). + # 2. We must propagate NA values from existing columns (so if we call ffill, and we are filling + # new column 'C', and old column 'A' has value 4 and old column 'B' has value NaN, we must fill + # column 'C' with NaN, unlike with standard fillna, where we would propagate 4.) 
+ # + # Now, columns_to_include includes the columns that we can use to fill the value in this column; + # however, since we want to propagate NA values, we can find the first column in the list that is + # an old column, and only pass that in to coalesce, so that if there is a NaN, it will be propagated, + # and if there isn't, the correct value will be propagated. + column = None + for col_name, ignore_bool in zip( + columns_to_include, _columns_to_ignore[start_pos:col_pos][::-1] + ): + if ignore_bool: + # This means that this is a column from the original data. + column = col_name + break + if column is None: + columns_to_include = columns_to_include[0:1] + else: + columns_to_include = [column] return coalesce( snowflake_quoted_identifier, - *self._modin_frame.data_column_snowflake_quoted_identifiers[ - start_pos:col_pos - ][::-1], + *columns_to_include, ) else: - start_pos = len_ids + end_pos = len_ids if limit is not None: - start_pos = min(col_pos + limit, len_ids) + # Add 1 since end index is exclusive. + end_pos = min(col_pos + limit + 1, len_ids) + columns_to_include = ( + self._modin_frame.data_column_snowflake_quoted_identifiers[ + col_pos:end_pos + ] + ) + if _columns_to_ignore: + column = None + for col_name, ignore_bool in zip( + columns_to_include, _columns_to_ignore[col_pos:end_pos] + ): + if ignore_bool: + # This means that this is a column from the original data. + column = col_name + break + if column is None: + columns_to_include = columns_to_include[0:1] + else: + columns_to_include = [column] + return coalesce( snowflake_quoted_identifier, - *self._modin_frame.data_column_snowflake_quoted_identifiers[ - start_pos:col_pos:-1 - ][::-1], + *columns_to_include, ) def fillna( @@ -8951,7 +9298,48 @@ def fillna( Returns ------- - BaseQueryCompiler + SnowflakeQueryCompiler + New QueryCompiler with all null values filled. 
+ """ + return self._fillna_with_masking( + value=value, + self_is_series=self_is_series, + method=method, + axis=axis, + limit=limit, + downcast=downcast, + ) + + def _fillna_with_masking( + self, + value: Optional[Union[Hashable, Mapping, "pd.DataFrame", "pd.Series"]] = None, + *, + self_is_series: bool, + method: Optional[FillnaOptions] = None, + axis: Optional[Axis] = None, + limit: Optional[int] = None, + downcast: Optional[dict] = None, + row_mask_snowflake_quoted_identifier: Optional[str] = None, + columns_mask: Optional[list[bool]] = None, + ) -> "SnowflakeQueryCompiler": + """ + Replace NaN values using provided method. + + Parameters + ---------- + value : scalar or dict + method : {"backfill", "bfill", "pad", "ffill", None} + axis : {0, 1} + limit : int, optional + downcast : dict, optional + row_mask_snowflake_quoted_identifier : str, optional + columns_mask : list[bool], optional + **kwargs : dict + Serves the compatibility purpose. Does not affect the result. + + Returns + ------- + SnowflakeQueryCompiler New QueryCompiler with all null values filled. """ if value is not None and limit is not None: @@ -8991,8 +9379,9 @@ def fillna( window_end = Window.UNBOUNDED_FOLLOWING else: window_end = limit - fillna_column_map = { - snowflake_quoted_id: coalesce( + + def fillna_expr(snowflake_quoted_id: str) -> SnowparkColumn: + return coalesce( snowflake_quoted_id, func(snowflake_quoted_id, ignore_nulls=True).over( Window.order_by( @@ -9000,15 +9389,49 @@ def fillna( ).rows_between(window_start, window_end) ), ) - for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers - } + + if row_mask_snowflake_quoted_identifier: + # This is an internal argument passed in by reindex + # that specifies a column to filter on when doing fillna + # columns that this filter is True for should have their + # NA values ignored. 
+ fillna_column_map = { + snowflake_quoted_id: iff( + col(row_mask_snowflake_quoted_identifier), + col(snowflake_quoted_id), + fillna_expr(snowflake_quoted_id), + ) + for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers + } + + else: + fillna_column_map = { + snowflake_quoted_id: fillna_expr(snowflake_quoted_id) + for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers + } else: - fillna_column_map = { - snowflake_quoted_id: self._make_fill_expression_for_column_wise_fillna( - snowflake_quoted_id, method, limit=limit - ) - for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers - } + if columns_mask is None: + fillna_column_map = { + snowflake_quoted_id: self._make_fill_expression_for_column_wise_fillna( + snowflake_quoted_id, + method, + limit=limit, + ) + for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers + } + else: + fillna_column_map = { + snowflake_quoted_id: self._make_fill_expression_for_column_wise_fillna( + snowflake_quoted_id, + method, + limit=limit, + _columns_to_ignore=columns_mask, + ) + for i, snowflake_quoted_id in enumerate( + self._modin_frame.data_column_snowflake_quoted_identifiers + ) + if not columns_mask[i] + } # case 3: fillna with a mapping else: # we create a mapping from column label to the fillin value and use coalesce to implement fillna @@ -9044,10 +9467,25 @@ def fillna( include_index=False, ) fillna_column_map = {} + if columns_mask is not None: + columns_to_ignore = itertools.compress( + self._modin_frame.data_column_pandas_labels, + columns_mask, + ) + else: + columns_to_ignore = [] # type:ignore [assignment] for label, id_tuple in zip(labels, id_tuples): - for id in id_tuple: - val = label_to_value_map[label] - fillna_column_map[id] = coalesce(id, pandas_lit(val)) + if label not in columns_to_ignore: + for id in id_tuple: + val = label_to_value_map[label] + if row_mask_snowflake_quoted_identifier is 
None: + fillna_column_map[id] = coalesce(id, pandas_lit(val)) + else: + fillna_column_map[id] = iff( + col(row_mask_snowflake_quoted_identifier), + col(id), + coalesce(id, pandas_lit(val)), + ) return SnowflakeQueryCompiler( self._modin_frame.update_snowflake_quoted_identifiers_with_expressions( diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py index 27fb4b0cd44..e4373ed5cf6 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py @@ -2785,7 +2785,210 @@ def rename(): """ def reindex(): - pass + """ + Conform DataFrame to new index with optional filling logic. + + Places NA/NaN in locations having no value in the previous index. A new object is produced + unless the new index is equivalent to the current one and copy=False. + + Parameters + ---------- + index : array-like, optional + New labels for the index. + axis : int or str, optional + Unused. + method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"}, default: None + Method to use for filling holes in reindexed DataFrame. + + * None (default): don't fill gaps + * pad / ffill: Propagate last valid observation forward to next valid. + * backfill / bfill: Use next valid observation to fill gap. + * nearest: Use nearest valid observations to fill gap. Unsupported by Snowpark pandas. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + + level : int or name + Broadcast across a level, matching Index values on the passed MultiIndex level. + + fill_value : scalar, default np.nan + Value to use for missing values. Defaults to NaN, but can be any “compatible” value. + + limit : int, default None + Maximum number of consecutive elements to forward or backward fill. + + tolerance : optional + Maximum distance between original and new labels for inexact matches. 
+ The values of the index at the matching locations most satisfy the + equation abs(index[indexer] - target) <= tolerance. Unsupported by + Snowpark pandas. + + Returns + ------- + DataFrame + DataFrame with changed index. + + Notes + ----- + For axis 0, Snowpark pandas' behaviour diverges from vanilla pandas in order + to maintain Snowpark's lazy execution paradigm. The behaviour changes are as follows: + + * Snowpark pandas does not error if the existing index is not monotonically increasing + or decreasing when `method` is specified for filling. It instead assumes that + the index is monotonically increasing, performs the reindex, and fills the values + as though the index is sorted (which involves sorting internally). + * Snowpark pandas does not error out if there are duplicates - they are included in the + output. + * Snowpark pandas does not error if a `limit` value is passed and the new index is not + monotonically increasing or decreasing - instead, it reindexes, sorts the new index, + fills using limit, and then reorders the data to be in the correct order (the order + of the target labels passed in to the method). + + For axis 1, Snowpark pandas' error checking remains the same as vanilla pandas. + + MultiIndex is currently unsupported. + + ``method="nearest"`` is currently unsupported. + + Examples + -------- + Create a dataframe with some fictional data. + + >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] + >>> df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301], + ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, + ... index=index) + >>> df + http_status response_time + Firefox 200 0.04 + Chrome 200 0.02 + Safari 404 0.07 + IE10 404 0.08 + Konqueror 301 1.00 + + Create a new index and reindex the dataframe. By default, values in the new index + that do not have corresponding records in the dataframe are assigned NaN. + + >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', + ... 
'Chrome'] + >>> df.reindex(new_index) + http_status response_time + Safari 404.0 0.07 + Iceweasel NaN NaN + Comodo Dragon NaN NaN + IE10 404.0 0.08 + Chrome 200.0 0.02 + + We can fill in the missing values by passing a value to the keyword fill_value. + + >>> df.reindex(new_index, fill_value=0) + http_status response_time + Safari 404 0.07 + Iceweasel 0 0.00 + Comodo Dragon 0 0.00 + IE10 404 0.08 + Chrome 200 0.02 + + >>> df.reindex(new_index, fill_value=-1) # doctest: +NORMALIZE_WHITESPACE + http_status response_time + Safari 404 0.07 + Iceweasel -1 -1.00 + Comodo Dragon -1 -1.00 + IE10 404 0.08 + Chrome 200 0.02 + + We can also reindex the columns. + + >>> df.reindex(columns=['http_status', 'user_agent']) # doctest: +NORMALIZE_WHITESPACE + http_status user_agent + Firefox 200 None + Chrome 200 None + Safari 404 None + IE10 404 None + Konqueror 301 None + + Or we can use “axis-style” keyword arguments + + >>> df.reindex(['http_status', 'user_agent'], axis="columns") # doctest: +NORMALIZE_WHITESPACE + http_status user_agent + Firefox 200 None + Chrome 200 None + Safari 404 None + IE10 404 None + Konqueror 301 None + + To further illustrate the filling functionality in reindex, we will create a dataframe + with a monotonically increasing index (for example, a sequence of dates). + + >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') + >>> df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, + ... index=date_index) + >>> df2 + prices + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + + Suppose we decide to expand the dataframe to cover a wider date range. 
+ + >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') + >>> df2.reindex(date_index2) + prices + 2009-12-29 NaN + 2009-12-30 NaN + 2009-12-31 NaN + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + The index entries that did not have a value in the original data frame (for example, + ``2009-12-29``) are by default filled with NaN. If desired, we can fill in the missing + values using one of several options. + + For example, to back-propagate the last valid value to fill the NaN values, pass bfill as + an argument to the method keyword. + + >>> df2.reindex(date_index2, method='bfill') + prices + 2009-12-29 100.0 + 2009-12-30 100.0 + 2009-12-31 100.0 + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + Please note that the NaN value present in the original dataframe (at index value 2010-01-03) will + not be filled by any of the value propagation schemes. This is because filling while reindexing + does not look at dataframe values, but only compares the original and desired indexes. If you do + want to fill in the NaN values present in the original dataframe, use the fillna() method. + + An example illustrating Snowpark pandas' behavior when dealing with non-monotonic indices. + >>> unordered_dataframe = pd.DataFrame([[5]*3, [8]*3, [6]*3], columns=list("ABC"), index=[5, 8, 6]) + >>> unordered_dataframe + A B C + 5 5 5 5 + 8 8 8 8 + 6 6 6 6 + >>> unordered_dataframe.reindex([6, 8, 7], method="ffill") + A B C + 6 6 6 6 + 8 8 8 8 + 7 6 6 6 + + In the example above, index value ``7`` is forward filled from index value ``6``, since that + is the previous index value when the data is sorted. 
+ """ def replace(): """ diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py index 576e5bd2f1e..65af39dec1c 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -1976,7 +1976,210 @@ def ravel(): """ def reindex(): - pass + """ + Conform Series to new index with optional filling logic. + + Places NA/NaN in locations having no value in the previous index. A new object is produced + unless the new index is equivalent to the current one and copy=False. + + Parameters + ---------- + index : array-like, optional + New labels for the index. + axis : int or str, optional + Unused. + method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"}, default: None + Method to use for filling holes in reindexed DataFrame. + + * None (default): don't fill gaps + * pad / ffill: Propagate last valid observation forward to next valid. + * backfill / bfill: Use next valid observation to fill gap. + * nearest: Use nearest valid observations to fill gap. Unsupported by Snowpark pandas. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + + level : int or name + Broadcast across a level, matching Index values on the passed MultiIndex level. + + fill_value : scalar, default np.nan + Value to use for missing values. Defaults to NaN, but can be any “compatible” value. + + limit : int, default None + Maximum number of consecutive elements to forward or backward fill. + + tolerance : optional + Maximum distance between original and new labels for inexact matches. + The values of the index at the matching locations most satisfy the + equation abs(index[indexer] - target) <= tolerance. Unsupported by + Snowpark pandas. + + Returns + ------- + Series + Series with changed index. 
+ + Notes + ----- + For axis 0, Snowpark pandas' behaviour diverges from vanilla pandas in order + to maintain Snowpark's lazy execution paradigm. The behaviour changes are as follows: + + * Snowpark pandas does not error if the existing index is not monotonically increasing + or decreasing when `method` is specified for filling. It instead assumes that + the index is monotonically increasing, performs the reindex, and fills the values + as though the index is sorted (which involves sorting internally). + * Snowpark pandas does not error out if there are duplicates - they are included in the + output. + * Snowpark pandas does not error if a `limit` value is passed and the new index is not + monotonically increasing or decreasing - instead, it reindexes, sorts the new index, + fills using limit, and then reorders the data to be in the correct order (the order + of the target labels passed in to the method). + + For axis 1, Snowpark pandas' error checking remains the same as vanilla pandas. + + MultiIndex is currently unsupported. + + ``method="nearest"`` is currently unsupported. + + Examples + -------- + Create a dataframe with some fictional data. + + >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] + >>> df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301], + ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, + ... index=index) + >>> df + http_status response_time + Firefox 200 0.04 + Chrome 200 0.02 + Safari 404 0.07 + IE10 404 0.08 + Konqueror 301 1.00 + + Create a new index and reindex the dataframe. By default, values in the new index + that do not have corresponding records in the dataframe are assigned NaN. + + >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', + ... 'Chrome'] + >>> df.reindex(new_index) + http_status response_time + Safari 404.0 0.07 + Iceweasel NaN NaN + Comodo Dragon NaN NaN + IE10 404.0 0.08 + Chrome 200.0 0.02 + + We can fill in the missing values by passing a value to the keyword fill_value. 
+ + >>> df.reindex(new_index, fill_value=0) + http_status response_time + Safari 404 0.07 + Iceweasel 0 0.00 + Comodo Dragon 0 0.00 + IE10 404 0.08 + Chrome 200 0.02 + + >>> df.reindex(new_index, fill_value=-1) # doctest: +NORMALIZE_WHITESPACE + http_status response_time + Safari 404 0.07 + Iceweasel -1 -1.00 + Comodo Dragon -1 -1.00 + IE10 404 0.08 + Chrome 200 0.02 + + We can also reindex the columns. + + >>> df.reindex(columns=['http_status', 'user_agent']) # doctest: +NORMALIZE_WHITESPACE + http_status user_agent + Firefox 200 None + Chrome 200 None + Safari 404 None + IE10 404 None + Konqueror 301 None + + Or we can use “axis-style” keyword arguments + + >>> df.reindex(['http_status', 'user_agent'], axis="columns") # doctest: +NORMALIZE_WHITESPACE + http_status user_agent + Firefox 200 None + Chrome 200 None + Safari 404 None + IE10 404 None + Konqueror 301 None + + To further illustrate the filling functionality in reindex, we will create a dataframe + with a monotonically increasing index (for example, a sequence of dates). + + >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') + >>> df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, + ... index=date_index) + >>> df2 + prices + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + + Suppose we decide to expand the dataframe to cover a wider date range. + + >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') + >>> df2.reindex(date_index2) + prices + 2009-12-29 NaN + 2009-12-30 NaN + 2009-12-31 NaN + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + The index entries that did not have a value in the original data frame (for example, + ``2009-12-29``) are by default filled with NaN. If desired, we can fill in the missing + values using one of several options. 
+ + For example, to back-propagate the last valid value to fill the NaN values, pass bfill as + an argument to the method keyword. + + >>> df2.reindex(date_index2, method='bfill') + prices + 2009-12-29 100.0 + 2009-12-30 100.0 + 2009-12-31 100.0 + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + Please note that the NaN value present in the original dataframe (at index value 2010-01-03) will + not be filled by any of the value propagation schemes. This is because filling while reindexing + does not look at dataframe values, but only compares the original and desired indexes. If you do + want to fill in the NaN values present in the original dataframe, use the fillna() method. + + An example illustrating Snowpark pandas' behavior when dealing with non-monotonic indices. + >>> unordered_dataframe = pd.DataFrame([[5]*3, [8]*3, [6]*3], columns=list("ABC"), index=[5, 8, 6]) + >>> unordered_dataframe + A B C + 5 5 5 5 + 8 8 8 8 + 6 6 6 6 + >>> unordered_dataframe.reindex([6, 8, 7], method="ffill") + A B C + 6 6 6 6 + 8 8 8 8 + 7 6 6 6 + + In the example above, index value ``7`` is forward filled from index value ``6``, since that + is the previous index value when the data is sorted. + """ def rename_axis(): """ diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py new file mode 100644 index 00000000000..692fd66471f --- /dev/null +++ b/tests/integ/modin/frame/test_reindex.py @@ -0,0 +1,571 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
+# + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.exceptions import SnowparkSQLException +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.utils import ( + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, + eval_snowpark_pandas_result, +) + + +@sql_count_checker(query_count=0) +@pytest.mark.parametrize("axis", [0, 1]) +def test_reindex_invalid_limit_parameter(axis): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), index=list("ABC"), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(axis=axis, labels=list("CAB"), fill_value=1, limit=1), + expect_exception=True, + expect_exception_match="limit argument only valid if doing pad, backfill or nearest reindexing", + expect_exception_type=ValueError, + assert_exception_equal=True, + ) + + +class TestReindexAxis0: + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_basic_reorder(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("CAB")) + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_basic_add_elements(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("CABDEF")) + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_basic_remove_elements(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: 
df.reindex(axis=0, labels=list("CA")) + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_basic_add_remove_elements(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(index=list("CADEFG")) + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_fill_value(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(index=list("CADEFG"), fill_value=-1.0), + ) + + @sql_count_checker(query_count=1, join_count=1) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + def test_reindex_index_fill_method(self, method, limit): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ACE")) + snow_df = pd.DataFrame(native_df) + + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(index=list("ABCDEFGH"), method=method, limit=limit), + ) + + @sql_count_checker(query_count=1, join_count=1) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + def test_reindex_index_fill_method_pandas_negative(self, method, limit): + # The negative is because pandas does not support `limit` parameter + # if the index and target are not both monotonic, while we do. 
+ native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + with pytest.raises( + ValueError, + match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", + ): + native_df.reindex(index=list("CEBFGA"), method=method, limit=limit) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex(index=list("CADEFG"), method=method, limit=limit) + else: + return df.reindex( + index=list("ACDEFG"), method=method, limit=limit + ).reindex(list("CADEFG")) + + eval_snowpark_pandas_result( + snow_df, + native_df, + perform_reindex, + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_ordered_index_unordered_new_index(self): + ordered_native_dataframe = native_pd.DataFrame( + [[5] * 3, [6] * 3, [8] * 3], columns=list("ABC"), index=[5, 6, 8] + ) + ordered_snow_dataframe = pd.DataFrame(ordered_native_dataframe) + eval_snowpark_pandas_result( + ordered_snow_dataframe, + ordered_native_dataframe, + lambda df: df.reindex(index=[6, 8, 7], method="ffill"), + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_fill_value_with_old_na_values(self): + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(index=list("CEBFGA"), fill_value=-1), + ) + + @sql_count_checker(query_count=1, join_count=1) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + @pytest.mark.xfail(reason="Cannot qualify window in fillna.") + def test_reindex_index_fill_method_with_old_na_values(self, limit, method): + # Say there are NA values in the data before reindex. 
reindex is called with + # ffill as the method, and there is a new NA value in the row following the + # row with the pre-existing NA value. In this case, the ffilled value should + # be an NA value (rather than looking to previous rows for a non-NA value). + # To support this, we would need to have `ignore_nulls=False`, but we would + # also need to qualify the window for values that were in the original DataFrame + # as otherwise, if we have multiple new index values that have NA values, we would + # pick these NA values instead of finding a non-NA value from the original DataFrame. + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ACE") + ) + snow_df = pd.DataFrame(native_df) + + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(index=list("ABCDEFGH"), method=method, limit=limit), + ) + + @sql_count_checker(query_count=1, join_count=1) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + @pytest.mark.xfail(reason="Cannot qualify window in fillna.") + def test_reindex_index_fill_method_with_old_na_values_pandas_negative( + self, limit, method + ): + # Say there are NA values in the data before reindex. reindex is called with + # ffill as the method, and there is a new NA value in the row following the + # row with the pre-existing NA value. In this case, the ffilled value should + # be an NA value (rather than looking to previous rows for a non-NA value). + # To support this, we would need to have `ignore_nulls=False`, but we would + # also need to qualify the window for values that were in the original DataFrame + # as otherwise, if we have multiple new index values that have NA values, we would + # pick these NA values instead of finding a non-NA value from the original DataFrame. 
+ native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + with pytest.raises( + ValueError, + match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", + ): + native_df.reindex(index=list("CEBFGA"), method=method, limit=limit) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex(index=list("CEBFGA"), method=method, limit=limit) + else: + return df.reindex( + index=list("ABCEFG"), method=method, limit=limit + ).reindex(list("CEBFGA")) + + eval_snowpark_pandas_result( + snow_df, + native_df, + perform_reindex, + ) + + @sql_count_checker(query_count=2, join_count=1) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + def test_reindex_index_datetime_with_fill(self, limit, method): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex( + pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + else: + return df.reindex( + native_pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + + eval_snowpark_pandas_result( + snow_df, native_df, perform_reindex, check_freq=False + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_non_overlapping_index(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + 
eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("EFG")) + ) + + @sql_count_checker(query_count=2, join_count=1) + def test_reindex_index_non_overlapping_datetime_index(self): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex( + pd.date_range("12/29/2008", periods=10, freq="D"), + ) + else: + return df.reindex( + native_pd.date_range("12/29/2008", periods=10, freq="D"), + ) + + eval_snowpark_pandas_result( + snow_df, native_df, perform_reindex, check_freq=False + ) + + @sql_count_checker(query_count=1) + def test_reindex_index_non_overlapping_different_types_index_negative(self): + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + with pytest.raises( + SnowparkSQLException, match=".*Timestamp 'A' is not recognized" + ): + snow_df.reindex(list("ABC")).to_pandas() + + @sql_count_checker(query_count=1, join_count=1) + @pytest.mark.parametrize( + "new_index", [list("ABC"), list("ABCC"), list("ABBC"), list("AABC")] + ) + def test_reindex_index_duplicate_values(self, new_index): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABA")) + snow_df = pd.DataFrame(native_df) + + with pytest.raises( + ValueError, match="cannot reindex on an axis with duplicate labels" + ): + native_df.reindex(index=new_index) + + dfs_to_concat = [] + for row in new_index: + if row not in list("AB"): + dfs_to_concat.append(native_pd.DataFrame([[np.nan] * 3], index=[row])) + else: + value = native_df.loc[row] + if isinstance(value, native_pd.Series): + value = 
native_pd.DataFrame(value).T + dfs_to_concat.append(value) + result_native_df = native_pd.concat(dfs_to_concat) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + snow_df.reindex(index=new_index), result_native_df + ) + + +class TestReindexAxis1: + @sql_count_checker(query_count=1) + def test_reindex_columns_basic_reorder(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=1, labels=list("CAB")) + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_basic_add_elements(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=1, labels=list("CABDEF")) + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_basic_remove_elements(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=1, labels=list("CA")) + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_basic_add_remove_elements(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(columns=list("CADEFG")) + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_fill_value(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=list("CADEFG"), fill_value=-1.0), + ) + + @sql_count_checker(query_count=1) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", 
"ffill"]) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + def test_reindex_columns_fill_method(self, method, limit): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ACE") + ) + snow_df = pd.DataFrame(native_df) + + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=list("ABCDEFGH"), method=method, limit=limit), + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_ordered_columns_unordered_new_columns(self): + ordered_native_dataframe = native_pd.DataFrame( + np.array([[5] * 3, [6] * 3, [8] * 3]).T, + index=list("ABC"), + columns=[5, 6, 8], + ) + ordered_snow_dataframe = pd.DataFrame(ordered_native_dataframe) + eval_snowpark_pandas_result( + ordered_snow_dataframe, + ordered_native_dataframe, + lambda df: df.reindex(columns=[6, 8, 7], method="ffill"), + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_fill_value_with_old_na_values(self): + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=list("CEBFGA"), fill_value=-1), + ) + + @sql_count_checker(query_count=1) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + def test_reindex_columns_fill_method_with_old_na_values(self, limit, method): + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=list("ACE") + ) + snow_df = pd.DataFrame(native_df) + + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=list("ABCDEFGH"), method=method, limit=limit), + ) + + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + def test_reindex_columns_fill_method_with_old_na_values_negative( + self, limit, method + 
): + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + match_str = f"limit argument for '{method_str}' method only well-defined if index and target are monotonic" + with SqlCounter(query_count=0): + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex( + columns=list("CEBFGA"), method=method, limit=limit + ), + expect_exception=True, + assert_exception_equal=True, + expect_exception_match=match_str, + expect_exception_type=ValueError, + ) + else: + with SqlCounter(query_count=1): + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=list("CEBFGA"), method=method), + ) + + @sql_count_checker(query_count=5) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + def test_reindex_columns_datetime_with_fill(self, limit, method): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex( + columns=pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + else: + return df.reindex( + columns=native_pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + + eval_snowpark_pandas_result( + snow_df, native_df, perform_reindex, check_freq=False + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_non_overlapping_columns(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), 
columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=1, labels=list("EFG")) + ) + + @sql_count_checker(query_count=5) + def test_reindex_columns_non_overlapping_datetime_columns(self): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex( + columns=pd.date_range("12/29/2008", periods=10, freq="D"), + ) + else: + return df.reindex( + columns=native_pd.date_range("12/29/2008", periods=10, freq="D"), + ) + + eval_snowpark_pandas_result( + snow_df, native_df, perform_reindex, check_freq=False + ) + + @sql_count_checker(query_count=2) + def test_reindex_columns_non_overlapping_different_types_columns(self): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(columns=list("ABCD")) + ) + + @sql_count_checker(query_count=0) + @pytest.mark.parametrize( + "new_columns", [list("ABC"), list("ABCC"), list("ABBC"), list("AABC")] + ) + def test_reindex_columns_duplicate_values(self, new_columns): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABA") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=new_columns), + expect_exception=True, + 
expect_exception_match="cannot reindex on an axis with duplicate labels", + assert_exception_equal=True, + expect_exception_type=ValueError, + ) + + +@sql_count_checker(query_count=0) +@pytest.mark.parametrize("axis", [0, 1]) +def test_reindex_multiindex_negative(axis): + snow_df = pd.DataFrame(np.arange(9).reshape((3, 3))) + snow_df = snow_df.set_index([0, 1]) + + with pytest.raises( + NotImplementedError, + match="Snowpark pandas doesn't support `reindex` with MultiIndex", + ): + if axis == 0: + snow_df.reindex(index=[1, 2, 3]) + else: + snow_df.T.reindex(columns=[1, 2, 3]) diff --git a/tests/integ/modin/series/test_reindex.py b/tests/integ/modin/series/test_reindex.py new file mode 100644 index 00000000000..7c2bbba906e --- /dev/null +++ b/tests/integ/modin/series/test_reindex.py @@ -0,0 +1,378 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import re + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.exceptions import SnowparkSQLException +from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.utils import ( + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, + eval_snowpark_pandas_result, +) + + +@sql_count_checker(query_count=0) +def test_reindex_invalid_limit_parameter(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(index=list("CAB"), fill_value=1, limit=1), + expect_exception=True, + expect_exception_match="limit argument only valid if doing pad, backfill or nearest reindexing", + expect_exception_type=ValueError, + assert_exception_equal=True, + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_invalid_axis_parameter_ignored(): + native_series = native_pd.Series([0, 1, 2], 
index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(axis=1, index=list("CAB")), + ) + + +@sql_count_checker(query_count=0) +def test_reindex_invalid_labels_parameter(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(labels=list("CAB")), + expect_exception=True, + expect_exception_match=re.escape("got an unexpected keyword argument 'labels'"), + expect_exception_type=TypeError, + assert_exception_equal=False, + ) + + +@sql_count_checker(query_count=0) +def test_reindex_index_passed_twice(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(list("CAB"), index=list("CAB")), + expect_exception=True, + expect_exception_match=re.escape("got multiple values for argument 'index'"), + expect_exception_type=TypeError, + assert_exception_equal=False, + ) + + +@sql_count_checker(query_count=0) +def test_reindex_multiple_args_passed(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(list("CAB"), index=list("CAB")), + expect_exception=True, + expect_exception_match=re.escape("got multiple values for argument 'index'"), + expect_exception_type=TypeError, + assert_exception_equal=False, + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_reorder(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, native_series, lambda series: series.reindex(index=list("CAB")) + ) + + 
+@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_add_elements(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, native_series, lambda series: series.reindex(index=list("CABDEF")) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_remove_elements(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, native_series, lambda series: series.reindex(index=list("CA")) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_add_remove_elements(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, native_series, lambda series: series.reindex(index=list("CADEFG")) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_fill_value(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(index=list("CADEFG"), fill_value=-1.0), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +def test_reindex_index_fill_method(method, limit): + native_series = native_pd.Series([0, 1, 2], index=list("ACE")) + snow_series = pd.Series(native_series) + + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex( + index=list("ABCDEFGH"), method=method, limit=limit + ), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +def 
test_reindex_index_fill_method_pandas_negative(method, limit): + # The negative is because pandas does not support `limit` parameter + # if the index and target are not both monotonic, while we do. + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + with pytest.raises( + ValueError, + match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", + ): + native_series.reindex(index=list("CEBFGA"), method=method, limit=limit) + + def perform_reindex(series): + if isinstance(series, pd.Series): + return series.reindex(index=list("CADEFG"), method=method, limit=limit) + else: + return series.reindex( + index=list("ACDEFG"), method=method, limit=limit + ).reindex(list("CADEFG")) + + eval_snowpark_pandas_result( + snow_series, + native_series, + perform_reindex, + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_ordered_index_unordered_new_index(): + ordered_native_Series = native_pd.Series([5, 6, 8], index=[5, 6, 8]) + ordered_snow_series = pd.Series(ordered_native_Series) + eval_snowpark_pandas_result( + ordered_snow_series, + ordered_native_Series, + lambda series: series.reindex(index=[6, 8, 7], method="ffill"), + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_fill_value_with_old_na_values(): + native_series = native_pd.Series([1, np.nan, 3], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(index=list("CEBFGA"), fill_value=-1), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +@pytest.mark.xfail(reason="Cannot qualify window in fillna.") +def 
test_reindex_index_fill_method_with_old_na_values(limit, method): + # Say there are NA values in the data before reindex. reindex is called with + # ffill as the method, and there is a new NA value in the row following the + # row with the pre-existing NA value. In this case, the ffilled value should + # be an NA value (rather than looking to previous rows for a non-NA value). + # To support this, we would need to have `ignore_nulls=False`, but we would + # also need to qualify the window for values that were in the original Series + # as otherwise, if we have multiple new index values that have NA values, we would + # pick these NA values instead of finding a non-NA value from the original Series. + native_series = native_pd.Series([1, np.nan, 3], index=list("ACE")) + snow_series = pd.Series(native_series) + + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex( + index=list("ABCDEFGH"), method=method, limit=limit + ), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +def test_reindex_index_fill_method_with_old_na_values_pandas_negative(limit, method): + native_series = native_pd.Series([1, np.nan, 3], index=list("ABC")) + snow_series = pd.Series(native_series) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + with pytest.raises( + ValueError, + match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", + ): + native_series.reindex(index=list("CEBFGA"), method=method, limit=limit) + + def perform_reindex(series): + if isinstance(series, pd.Series): + return series.reindex(index=list("CEBFGA"), method=method, limit=limit) + else: + return series.reindex( + index=list("ABCEFG"), method=method, limit=limit + ).reindex(list("CEBFGA")) + + eval_snowpark_pandas_result( + snow_series, + native_series, + 
perform_reindex,
+    )
+
+
+@sql_count_checker(query_count=2, join_count=1)
+@pytest.mark.parametrize("limit", [None, 1, 2, 100])
+@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"])
+def test_reindex_index_datetime_with_fill(limit, method):
+    date_index = native_pd.date_range("1/1/2010", periods=6, freq="D")
+    native_series = native_pd.Series(
+        [100, 101, np.nan, 100, 89, 88], index=date_index
+    )
+    date_index = pd.date_range("1/1/2010", periods=6, freq="D")
+    snow_series = pd.Series(
+        [100, 101, np.nan, 100, 89, 88], index=date_index
+    )
+
+    def perform_reindex(series):
+        if isinstance(series, pd.Series):
+            return series.reindex(
+                pd.date_range("12/29/2009", periods=10, freq="D"),
+                method=method,
+                limit=limit,
+            )
+        else:
+            return series.reindex(
+                native_pd.date_range("12/29/2009", periods=10, freq="D"),
+                method=method,
+                limit=limit,
+            )
+
+    eval_snowpark_pandas_result(
+        snow_series, native_series, perform_reindex, check_freq=False
+    )
+
+
+@sql_count_checker(query_count=1, join_count=1)
+def test_reindex_index_non_overlapping_index():
+    native_series = native_pd.Series([0, 1, 2], index=list("ABC"))
+    snow_series = pd.Series(native_series)
+    eval_snowpark_pandas_result(
+        snow_series, native_series, lambda series: series.reindex(index=list("EFG"))
+    )
+
+
+@sql_count_checker(query_count=2, join_count=1)
+def test_reindex_index_non_overlapping_datetime_index():
+    date_index = native_pd.date_range("1/1/2010", periods=6, freq="D")
+    native_series = native_pd.Series(
+        [100, 101, np.nan, 100, 89, 88], index=date_index
+    )
+    date_index = pd.date_range("1/1/2010", periods=6, freq="D")
+    snow_series = pd.Series(
+        [100, 101, np.nan, 100, 89, 88], index=date_index
+    )
+
+    def perform_reindex(series):
+        if isinstance(series, pd.Series):
+            return series.reindex(
+                pd.date_range("12/29/2008", periods=10, freq="D"),
+            )
+        else:
+            return series.reindex(
+                native_pd.date_range("12/29/2008", 
periods=10, freq="D"), + ) + + eval_snowpark_pandas_result( + snow_series, native_series, perform_reindex, check_freq=False + ) + + +@sql_count_checker(query_count=1) +def test_reindex_index_non_overlapping_different_types_index_negative(): + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_series = pd.Series( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + with pytest.raises(SnowparkSQLException, match=".*Timestamp 'A' is not recognized"): + snow_series.reindex(list("ABC")).to_pandas() + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize( + "new_index", [list("ABC"), list("ABCC"), list("ABBC"), list("AABC")] +) +def test_reindex_index_duplicate_values(new_index): + native_series = native_pd.Series([0, 1, 2], index=list("ABA")) + snow_series = pd.Series(native_series) + + with pytest.raises( + ValueError, match="cannot reindex on an axis with duplicate labels" + ): + native_series.reindex(index=new_index) + + series_to_concat = [] + for row in new_index: + if row not in list("AB"): + series_to_concat.append(native_pd.Series([np.nan], index=[row])) + else: + value = native_series.loc[row] + if not isinstance(value, native_pd.Series): + value = native_pd.Series([value], index=[row]) + series_to_concat.append(value) + result_native_series = native_pd.concat(series_to_concat) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + snow_series.reindex(index=new_index), result_native_series + ) + + +@sql_count_checker(query_count=0) +def test_reindex_multiindex_negative(): + snow_series = pd.Series( + [0, 1, 2], index=native_pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)]) + ) + + with pytest.raises( + NotImplementedError, + match="Snowpark pandas doesn't support `reindex` with MultiIndex", + ): + snow_series.reindex(index=[1, 2, 3])