From c0dca708c343b187a07e6a47b67c9d592fec65de Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 9 Jul 2024 16:23:19 -0700 Subject: [PATCH 01/20] [SNOW-1524894]: Add support for `reindex` --- src/snowflake/snowpark/modin/pandas/base.py | 1 - .../snowpark/modin/pandas/dataframe.py | 1 - .../compiler/snowflake_query_compiler.py | 55 +++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/base.py b/src/snowflake/snowpark/modin/pandas/base.py index 3beb95948ee..8d06ecb5993 100644 --- a/src/snowflake/snowpark/modin/pandas/base.py +++ b/src/snowflake/snowpark/modin/pandas/base.py @@ -2584,7 +2584,6 @@ def _ensure_index(self, index_like, axis=0): # noqa: PR01, RT01, D200 pass return ensure_index(index_like) - @base_not_implemented() def reindex( self, index=None, diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 919016830c9..e98aaec9a0a 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -2068,7 +2068,6 @@ def rename( new_query_compiler=new_qc, inplace=inplace ) - @dataframe_not_implemented() def reindex( self, labels=None, diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index e9f7f05dcb5..53e2ce7e0bc 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2176,6 +2176,61 @@ def any( False, "any", axis=axis, _bool_only=bool_only, skipna=skipna ) + def reindex( + self, + axis: int, + labels: Union[pandas.Index, list[Any]], + **kwargs: dict[str, Any], + ) -> "SnowflakeQueryCompiler": + """ + Align QueryCompiler data with a new index along specified axis. + + Parameters + ---------- + axis : {0, 1} + Axis to align labels along. 0 is for index, 1 is for columns. + labels : list-like + Index-labels to align with. + method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"} + Method to use for filling holes in reindexed frame. + fill_value : scalar + Value to use for missing values in the resulted frame. + limit : int + tolerance : int + **kwargs : dict + Serves the compatibility purpose. Does not affect the result. + + Returns + ------- + BaseQueryCompiler + QueryCompiler with aligned axis. + """ + if axis == 0: + new_index_qc = pd.Series(labels)._query_compiler + new_index_modin_frame = new_index_qc._modin_frame + modin_frame = self._modin_frame + result_frame, result_frame_column_mapper = join_utils.join( + new_index_modin_frame, + modin_frame, + how="left", + left_on=new_index_modin_frame.data_column_snowflake_quoted_identifiers, + right_on=modin_frame.index_column_snowflake_quoted_identifiers, + ) + new_modin_frame = InternalFrame.create( + ordered_dataframe=result_frame.ordered_dataframe, + data_column_pandas_labels=modin_frame.data_column_pandas_labels, + data_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_right_quoted_identifiers( + modin_frame.data_column_snowflake_quoted_identifiers + ), + data_column_pandas_index_names=modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_left_quoted_identifiers( + new_index_modin_frame.data_column_snowflake_quoted_identifiers + ), + ) + return SnowflakeQueryCompiler(new_modin_frame) + return self + def _parse_names_arguments_from_reset_index( self, names: IndexLabel, From 669954bc62ee67d676b7f32dd5f1529b42e37517 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 9 Jul 2024 20:35:18 -0700 Subject: [PATCH 02/20] Add fill logic for index --- .../compiler/snowflake_query_compiler.py | 155 ++++++++++++++++-- 1 file changed, 145 insertions(+), 10 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 53e2ce7e0bc..9f9868d337a 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2209,6 +2209,29 @@ def reindex( new_index_qc = pd.Series(labels)._query_compiler new_index_modin_frame = new_index_qc._modin_frame modin_frame = self._modin_frame + method = kwargs.get("method", None) + fill_value = kwargs.get("fill_value", None) + _filter_column_snowflake_quoted_id = None + if fill_value is not np.nan or method: + # If we are filling values, reindex ignores NaN values that + # were previously present in the DataFrame before reindexing. + # In order to differentiate between pre-existing NaN values, + # and new NaN values caused by new index values that are not + # present, we can attach a boolean column of all `True`'s to + # self's modin_frame. After the left join with the new index + # rows that were present in self will have a True value, while + # rows that were not present in self will have a NA value. We can + # filter by which rows have an NA value for the dummy column to determine + # between pre-existing NaN's, and NaN's that were introduced because of new + # values in the index that are not present in the old index. If a row + # has a True value for the dummy column, any NaN's in it should be ignored + # as it is a pre-existing NaN value that we **should not** fill. + modin_frame = modin_frame.append_column( + "dummy_reindex_column_for_fill", pandas_lit(True) + ) + _filter_column_snowflake_quoted_id = ( + modin_frame.data_column_snowflake_quoted_identifiers[-1] + ) result_frame, result_frame_column_mapper = join_utils.join( new_index_modin_frame, modin_frame, @@ -2216,19 +2239,107 @@ def reindex( left_on=new_index_modin_frame.data_column_snowflake_quoted_identifiers, right_on=modin_frame.index_column_snowflake_quoted_identifiers, ) + data_column_pandas_labels = modin_frame.data_column_pandas_labels + data_column_snowflake_quoted_identifiers = ( + result_frame_column_mapper.map_right_quoted_identifiers( + modin_frame.data_column_snowflake_quoted_identifiers + ) + ) + if method is not None: + data_column_pandas_labels += ["_sort_index"] + data_column_snowflake_quoted_identifiers += ( + result_frame_column_mapper.map_right_quoted_identifiers( + modin_frame.index_column_snowflake_quoted_identifiers + ) + ) new_modin_frame = InternalFrame.create( ordered_dataframe=result_frame.ordered_dataframe, - data_column_pandas_labels=modin_frame.data_column_pandas_labels, - data_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_right_quoted_identifiers( - modin_frame.data_column_snowflake_quoted_identifiers - ), + data_column_pandas_labels=data_column_pandas_labels, + data_column_snowflake_quoted_identifiers=data_column_snowflake_quoted_identifiers, data_column_pandas_index_names=modin_frame.data_column_pandas_index_names, index_column_pandas_labels=modin_frame.index_column_pandas_labels, index_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_left_quoted_identifiers( new_index_modin_frame.data_column_snowflake_quoted_identifiers ), ) - return SnowflakeQueryCompiler(new_modin_frame) + new_qc = SnowflakeQueryCompiler(new_modin_frame) + if method or fill_value is not np.nan: + limit = kwargs.get("limit", None) + if method not in ["nearest", None]: + new_filter_column_snowflake_quoted_id = ( + result_frame_column_mapper.map_right_quoted_identifiers( + [_filter_column_snowflake_quoted_id] + )[0] + ) + new_modin_frame.update_snowflake_quoted_identifiers_with_expressions( + { + new_filter_column_snowflake_quoted_id: coalesce( + new_filter_column_snowflake_quoted_id, pandas_lit(False) + ) + } + ) + new_qc = SnowflakeQueryCompiler( + new_modin_frame.ensure_row_position_column() + ) + ordering_column = ( + new_qc._modin_frame.row_position_snowflake_quoted_identifier + ) + new_qc = new_qc.sort_rows_by_column_values( + columns=["_sort_index"], + ascending=[True], + kind="stable", + na_position="last", + ignore_index=False, + ) + new_qc = new_qc.fillna( + self_is_series=False, + method=method, + limit=limit, + _filter_column_snowflake_quoted_id=new_filter_column_snowflake_quoted_id, + ) + new_ordered_frame = new_qc._modin_frame.ordered_dataframe.sort( + OrderingColumn(snowflake_quoted_identifier=ordering_column) + ) + new_ordered_frame.row_position_snowflake_quoted_identifier = ( + ordering_column + ) + new_qc = SnowflakeQueryCompiler( + InternalFrame.create( + ordered_dataframe=new_ordered_frame, + data_column_pandas_labels=modin_frame.data_column_pandas_labels[ + :-1 + ], + data_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_right_quoted_identifiers( + modin_frame.data_column_snowflake_quoted_identifiers[ + :-1 + ] + ), + data_column_pandas_index_names=modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_left_quoted_identifiers( + new_index_modin_frame.data_column_snowflake_quoted_identifiers + ), + ) + ) + if fill_value is not np.nan: + new_modin_frame = new_qc.fillna( + self_is_series=False, value=fill_value + )._modin_frame + new_qc = SnowflakeQueryCompiler( + InternalFrame.create( + ordered_dataframe=new_modin_frame.ordered_dataframe, + data_column_pandas_labels=new_modin_frame.data_column_pandas_labels[ + :-1 + ], + data_column_snowflake_quoted_identifiers=new_modin_frame.data_column_snowflake_quoted_identifiers[ + :-1 + ], + data_column_pandas_index_names=new_modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=new_modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=new_modin_frame.index_column_snowflake_quoted_identifiers, + ) + ) + return new_qc return self def _parse_names_arguments_from_reset_index( @@ -8893,6 +9004,7 @@ def fillna( axis: Optional[Axis] = None, limit: Optional[int] = None, downcast: Optional[dict] = None, + _filter_column_snowflake_quoted_id=None, ) -> "SnowflakeQueryCompiler": """ Replace NaN values using provided method. @@ -8904,6 +9016,7 @@ def fillna( axis : {0, 1} limit : int, optional downcast : dict, optional + _filter_column_snowflake_quoted_id : str, optional **kwargs : dict Serves the compatibility purpose. Does not affect the result. @@ -8944,14 +9057,36 @@ def fillna( func = first_value window_start = Window.CURRENT_ROW window_end = Window.UNBOUNDED_FOLLOWING - fillna_column_map = { - snowflake_quoted_id: coalesce( - snowflake_quoted_id, - func(snowflake_quoted_id, ignore_nulls=True).over( + expr_maker = None + if _filter_column_snowflake_quoted_id: + # This is an internal argument passed in by reindex + # that specifies a column to filter on when doing fillna + # columns that this filter is True for should have their + # NA values ignored. + def expr_maker(snowflake_quoted_id): + return iff( + not_(col(_filter_column_snowflake_quoted_id)), + col(snowflake_quoted_id), + func(snowflake_quoted_id, ignore_nulls=True).over( + Window.order_by( + self._modin_frame.row_position_snowflake_quoted_identifier + ).rows_between(window_start, window_end) + ), + ) + + else: + + def expr_maker(snowflake_quoted_id): + return func(snowflake_quoted_id, ignore_nulls=True).over( Window.order_by( self._modin_frame.row_position_snowflake_quoted_identifier ).rows_between(window_start, window_end) - ), + ) + + fillna_column_map = { + snowflake_quoted_id: coalesce( + snowflake_quoted_id, + expr_maker(snowflake_quoted_id), ) for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers } From 0bf4a06a177ecf34e6d02da962610d1877b9b080 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 10 Jul 2024 16:25:43 -0700 Subject: [PATCH 03/20] Add support for filling logic when axis=0 and tests --- .../compiler/snowflake_query_compiler.py | 156 +++++++++--------- tests/integ/modin/frame/test_reindex.py | 80 +++++++++ 2 files changed, 162 insertions(+), 74 deletions(-) create mode 100644 tests/integ/modin/frame/test_reindex.py diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 9f9868d337a..12ee8b5accd 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2245,13 +2245,6 @@ def reindex( modin_frame.data_column_snowflake_quoted_identifiers ) ) - if method is not None: - data_column_pandas_labels += ["_sort_index"] - data_column_snowflake_quoted_identifiers += ( - result_frame_column_mapper.map_right_quoted_identifiers( - modin_frame.index_column_snowflake_quoted_identifiers - ) - ) new_modin_frame = InternalFrame.create( ordered_dataframe=result_frame.ordered_dataframe, data_column_pandas_labels=data_column_pandas_labels, @@ -2265,19 +2258,26 @@ def reindex( new_qc = SnowflakeQueryCompiler(new_modin_frame) if method or fill_value is not np.nan: limit = kwargs.get("limit", None) + new_filter_column_snowflake_quoted_id = ( + result_frame_column_mapper.map_right_quoted_identifiers( + [_filter_column_snowflake_quoted_id] + )[0] + ) + ( + new_modin_frame, + mapper, + ) = new_modin_frame.update_snowflake_quoted_identifiers_with_expressions( + { + new_filter_column_snowflake_quoted_id: coalesce( + col(new_filter_column_snowflake_quoted_id), + pandas_lit(False), + ) + } + ) + new_filter_column_snowflake_quoted_id = mapper[ + new_filter_column_snowflake_quoted_id + ] if method not in ["nearest", None]: - new_filter_column_snowflake_quoted_id = ( - result_frame_column_mapper.map_right_quoted_identifiers( - [_filter_column_snowflake_quoted_id] - )[0] - ) - new_modin_frame.update_snowflake_quoted_identifiers_with_expressions( - { - new_filter_column_snowflake_quoted_id: coalesce( - new_filter_column_snowflake_quoted_id, pandas_lit(False) - ) - } - ) new_qc = SnowflakeQueryCompiler( new_modin_frame.ensure_row_position_column() ) @@ -2285,7 +2285,7 @@ def reindex( new_qc._modin_frame.row_position_snowflake_quoted_identifier ) new_qc = new_qc.sort_rows_by_column_values( - columns=["_sort_index"], + columns=new_modin_frame.index_column_pandas_labels, ascending=[True], kind="stable", na_position="last", @@ -2294,7 +2294,8 @@ def reindex( new_qc = new_qc.fillna( self_is_series=False, method=method, - limit=limit, + limit=limit, # type: ignore[arg-type] + axis=0, _filter_column_snowflake_quoted_id=new_filter_column_snowflake_quoted_id, ) new_ordered_frame = new_qc._modin_frame.ordered_dataframe.sort( @@ -2306,39 +2307,42 @@ def reindex( new_qc = SnowflakeQueryCompiler( InternalFrame.create( ordered_dataframe=new_ordered_frame, - data_column_pandas_labels=modin_frame.data_column_pandas_labels[ - :-1 - ], - data_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_right_quoted_identifiers( - modin_frame.data_column_snowflake_quoted_identifiers[ - :-1 - ] - ), - data_column_pandas_index_names=modin_frame.data_column_pandas_index_names, - index_column_pandas_labels=modin_frame.index_column_pandas_labels, - index_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_left_quoted_identifiers( - new_index_modin_frame.data_column_snowflake_quoted_identifiers - ), - ) - ) - if fill_value is not np.nan: - new_modin_frame = new_qc.fillna( - self_is_series=False, value=fill_value - )._modin_frame - new_qc = SnowflakeQueryCompiler( - InternalFrame.create( - ordered_dataframe=new_modin_frame.ordered_dataframe, - data_column_pandas_labels=new_modin_frame.data_column_pandas_labels[ + data_column_pandas_labels=new_qc._modin_frame.data_column_pandas_labels[ :-1 ], - data_column_snowflake_quoted_identifiers=new_modin_frame.data_column_snowflake_quoted_identifiers[ + data_column_snowflake_quoted_identifiers=new_qc._modin_frame.data_column_snowflake_quoted_identifiers[ :-1 ], - data_column_pandas_index_names=new_modin_frame.data_column_pandas_index_names, - index_column_pandas_labels=new_modin_frame.index_column_pandas_labels, - index_column_snowflake_quoted_identifiers=new_modin_frame.index_column_snowflake_quoted_identifiers, + data_column_pandas_index_names=new_qc._modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=new_qc._modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=new_qc._modin_frame.index_column_snowflake_quoted_identifiers, ) ) + if fill_value is not np.nan: + new_qc = new_qc.fillna( + self_is_series=False, + value=fill_value, + axis=0, + _filter_column_snowflake_quoted_id=new_filter_column_snowflake_quoted_id, + ) + if method in ["nearest", None]: + # In this case, we haven't removed the dummy column that tells us which NA values + # should not be replaced. + new_qc = SnowflakeQueryCompiler( + InternalFrame.create( + ordered_dataframe=new_qc._modin_frame.ordered_dataframe, + data_column_pandas_labels=new_qc._modin_frame.data_column_pandas_labels[ + :-1 + ], + data_column_snowflake_quoted_identifiers=new_qc._modin_frame.data_column_snowflake_quoted_identifiers[ + :-1 + ], + data_column_pandas_index_names=new_qc._modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=new_qc._modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=new_qc._modin_frame.index_column_snowflake_quoted_identifiers, + ) + ) + return new_qc return self @@ -9004,7 +9008,7 @@ def fillna( axis: Optional[Axis] = None, limit: Optional[int] = None, downcast: Optional[dict] = None, - _filter_column_snowflake_quoted_id=None, + _filter_column_snowflake_quoted_id: Optional[str] = None, ) -> "SnowflakeQueryCompiler": """ Replace NaN values using provided method. @@ -9057,39 +9061,36 @@ def fillna( func = first_value window_start = Window.CURRENT_ROW window_end = Window.UNBOUNDED_FOLLOWING - expr_maker = None + + def fillna_expr(snowflake_quoted_id: str) -> SnowparkColumn: + return coalesce( + snowflake_quoted_id, + func(snowflake_quoted_id, ignore_nulls=True).over( + Window.order_by( + self._modin_frame.row_position_snowflake_quoted_identifier + ).rows_between(window_start, window_end) + ), + ) + if _filter_column_snowflake_quoted_id: # This is an internal argument passed in by reindex # that specifies a column to filter on when doing fillna # columns that this filter is True for should have their # NA values ignored. - def expr_maker(snowflake_quoted_id): - return iff( - not_(col(_filter_column_snowflake_quoted_id)), + fillna_column_map = { + snowflake_quoted_id: iff( + col(_filter_column_snowflake_quoted_id), col(snowflake_quoted_id), - func(snowflake_quoted_id, ignore_nulls=True).over( - Window.order_by( - self._modin_frame.row_position_snowflake_quoted_identifier - ).rows_between(window_start, window_end) - ), + fillna_expr(snowflake_quoted_id), ) + for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers + } else: - - def expr_maker(snowflake_quoted_id): - return func(snowflake_quoted_id, ignore_nulls=True).over( - Window.order_by( - self._modin_frame.row_position_snowflake_quoted_identifier - ).rows_between(window_start, window_end) - ) - - fillna_column_map = { - snowflake_quoted_id: coalesce( - snowflake_quoted_id, - expr_maker(snowflake_quoted_id), - ) - for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers - } + fillna_column_map = { + snowflake_quoted_id: fillna_expr(snowflake_quoted_id) + for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers + } else: fillna_column_map = { snowflake_quoted_id: self._make_fill_expression_for_column_wise_fillna( @@ -9135,7 +9136,14 @@ def expr_maker(snowflake_quoted_id): for label, id_tuple in zip(labels, id_tuples): for id in id_tuple: val = label_to_value_map[label] - fillna_column_map[id] = coalesce(id, pandas_lit(val)) + if _filter_column_snowflake_quoted_id is None: + fillna_column_map[id] = coalesce(id, pandas_lit(val)) + else: + fillna_column_map[id] = iff( + col(_filter_column_snowflake_quoted_id), + col(id), + coalesce(id, pandas_lit(val)), + ) return SnowflakeQueryCompiler( self._modin_frame.update_snowflake_quoted_identifiers_with_expressions( diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py new file mode 100644 index 00000000000..df3775f6e07 --- /dev/null +++ b/tests/integ/modin/frame/test_reindex.py @@ -0,0 +1,80 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.utils import eval_snowpark_pandas_result + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_reorder(): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("CAB")) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_add_elements(): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("CABDEF")) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_remove_elements(): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("CA")) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_add_remove_elements(): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(index=list("CADEFG")) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_fill_value(): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(index=list("CADEFG"), fill_value=-1.0) + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +def test_reindex_index_fill_method(method): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(index=list("CADEFG"), method=method) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_ordered_index_unordered_new_index(): + ordered_native_dataframe = native_pd.DataFrame( + [[5] * 3, [6] * 3, [8] * 3], columns=list("ABC"), index=[5, 6, 8] + ) + ordered_snow_dataframe = pd.DataFrame(ordered_native_dataframe) + eval_snowpark_pandas_result( + ordered_snow_dataframe, + ordered_native_dataframe, + lambda df: df.reindex(index=[6, 8, 7], method="ffill"), + ) From 71f90696d82fa8abb772bfded1299bd42c63c741 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 10 Jul 2024 16:30:26 -0700 Subject: [PATCH 04/20] Add testing for fillna with pre-existing NA values --- tests/integ/modin/frame/test_reindex.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index df3775f6e07..f214b874161 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -78,3 +78,26 @@ def test_reindex_index_ordered_index_unordered_new_index(): ordered_native_dataframe, lambda df: df.reindex(index=[6, 8, 7], method="ffill"), ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_fill_value_with_old_na_values(): + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(index=list("CEBFGA"), fill_value=-1) + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +def test_reindex_index_fill_method_with_old_na_values(method): + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(index=list("CEBFGA"), method=method) + ) From de47ea5b2f5f6ad55526c61d231a2d4a5d23341b Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 10 Jul 2024 17:50:18 -0700 Subject: [PATCH 05/20] Add limit to tests --- tests/integ/modin/frame/test_reindex.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index f214b874161..c61c5f89ae9 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -59,11 +59,14 @@ def test_reindex_index_fill_value(): @sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) -def test_reindex_index_fill_method(method): +@pytest.mark.parametrize("limit", [None, 1, 2, 1000]) +def test_reindex_index_fill_method(method, limit): native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) snow_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( - snow_df, native_df, lambda df: df.reindex(index=list("CADEFG"), method=method) + snow_df, + native_df, + lambda df: df.reindex(index=list("CADEFG"), method=method, limit=limit), ) @@ -92,12 +95,15 @@ def test_reindex_index_fill_value_with_old_na_values(): @sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("limit", [None, 1, 2, 1000]) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) -def test_reindex_index_fill_method_with_old_na_values(method): +def test_reindex_index_fill_method_with_old_na_values(limit, method): native_df = native_pd.DataFrame( [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ABC") ) snow_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( - snow_df, native_df, lambda df: df.reindex(index=list("CEBFGA"), method=method) + snow_df, + native_df, + lambda df: df.reindex(index=list("CEBFGA"), method=method, limit=limit), ) From db0d2374b708c7b238d083506baf3a9071cdc0ca Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 10 Jul 2024 18:02:28 -0700 Subject: [PATCH 06/20] Add datetime index test --- tests/integ/modin/frame/test_reindex.py | 30 +++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index c61c5f89ae9..a3a00949d8f 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -107,3 +107,33 @@ def test_reindex_index_fill_method_with_old_na_values(limit, method): native_df, lambda df: df.reindex(index=list("CEBFGA"), method=method, limit=limit), ) + + +@sql_count_checker(query_count=2, join_count=1) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +def test_reindex_index_datetime_with_fill(limit, method): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex( + pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + else: + return df.reindex( + native_pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + + eval_snowpark_pandas_result(snow_df, native_df, perform_reindex, check_freq=False) From 69afda7944c746c64f1519ca704402951e64a998 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 10 Jul 2024 18:24:32 -0700 Subject: [PATCH 07/20] Add tests around monotonicity --- tests/integ/modin/frame/test_reindex.py | 117 +++++++++++++++++++++++- 1 file changed, 113 insertions(+), 4 deletions(-) diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index a3a00949d8f..b3151d0a2e3 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -8,6 +8,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.exceptions import SnowparkSQLException from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -59,14 +60,46 @@ def test_reindex_index_fill_value(): @sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) -@pytest.mark.parametrize("limit", [None, 1, 2, 1000]) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) def test_reindex_index_fill_method(method, limit): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ACE")) + snow_df = pd.DataFrame(native_df) + + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(index=list("ABCDEFGH"), method=method, limit=limit), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +def test_reindex_index_fill_method_pandas_negative(method, limit): + # The negative is because pandas does not support `limit` parameter + # if the index and target are not both monotonic, while we do. native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) snow_df = pd.DataFrame(native_df) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + with pytest.raises( + ValueError, + match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", + ): + native_df.reindex(index=list("CEBFGA"), method=method, limit=limit) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex(index=list("CADEFG"), method=method, limit=limit) + else: + return df.reindex(index=list("ACDEFG"), method=method, limit=limit).reindex( + list("CADEFG") + ) + eval_snowpark_pandas_result( snow_df, native_df, - lambda df: df.reindex(index=list("CADEFG"), method=method, limit=limit), + perform_reindex, ) @@ -95,17 +128,49 @@ def test_reindex_index_fill_value_with_old_na_values(): @sql_count_checker(query_count=1, join_count=1) -@pytest.mark.parametrize("limit", [None, 1, 2, 1000]) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) def test_reindex_index_fill_method_with_old_na_values(limit, method): + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ACE") + ) + snow_df = pd.DataFrame(native_df) + + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(index=list("ABCDEFGH"), method=method, limit=limit), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +def test_reindex_index_fill_method_with_old_na_values_pandas_negative(limit, method): native_df = native_pd.DataFrame( [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ABC") ) snow_df = pd.DataFrame(native_df) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + with pytest.raises( + ValueError, + match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", + ): + native_df.reindex(index=list("CEBFGA"), method=method, limit=limit) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex(index=list("CEBFGA"), method=method, limit=limit) + else: + return df.reindex(index=list("ABCEFG"), method=method, limit=limit).reindex( + list("CEBFGA") + ) + eval_snowpark_pandas_result( snow_df, native_df, - lambda df: df.reindex(index=list("CEBFGA"), method=method, limit=limit), + perform_reindex, ) @@ -137,3 +202,47 @@ def perform_reindex(df): ) eval_snowpark_pandas_result(snow_df, native_df, perform_reindex, check_freq=False) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_non_overlapping_index(): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("EFG")) + ) + + +@sql_count_checker(query_count=2, join_count=1) +def test_reindex_index_non_overlapping_datetime_index(): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex( + pd.date_range("12/29/2008", periods=10, freq="D"), + ) + else: + return df.reindex( + native_pd.date_range("12/29/2008", periods=10, freq="D"), + ) + + eval_snowpark_pandas_result(snow_df, native_df, perform_reindex, check_freq=False) + + +@sql_count_checker(query_count=1) +def test_reindex_index_non_overlapping_different_types_index_negative(): + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + with pytest.raises(SnowparkSQLException, match=".*Timestamp 'A' is not recognized"): + snow_df.reindex(list("ABC")).to_pandas() From 2e45ae115d4758279981057061c5408208995a2e Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 10 Jul 2024 18:46:15 -0700 Subject: [PATCH 08/20] Add xfail to tests with NA propagation --- tests/integ/modin/frame/test_reindex.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index b3151d0a2e3..c11d656f686 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -130,7 +130,16 @@ def test_reindex_index_fill_value_with_old_na_values(): @sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("limit", [None, 1, 2, 100]) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +@pytest.mark.xfail(reason="Cannot qualify window in fillna.") def test_reindex_index_fill_method_with_old_na_values(limit, method): + # Say there are NA values in the data before reindex. reindex is called with + # ffill as the method, and there is a new NA value in the row following the + # row with the pre-existing NA value. In this case, the ffilled value should + # be an NA value (rather than looking to previous rows for a non-NA value). + # To support this, we would need to have `ignore_nulls=False`, but we would + # also need to qualify the window for values that were in the original DataFrame + # as otherwise, if we have multiple new index values that have NA values, we would + # pick these NA values instead of finding a non-NA value from the original DataFrame. native_df = native_pd.DataFrame( [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ACE") ) @@ -146,7 +155,16 @@ def test_reindex_index_fill_method_with_old_na_values(limit, method): @sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("limit", [None, 1, 2, 100]) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +@pytest.mark.xfail(reason="Cannot qualify window in fillna.") def test_reindex_index_fill_method_with_old_na_values_pandas_negative(limit, method): + # Say there are NA values in the data before reindex. reindex is called with + # ffill as the method, and there is a new NA value in the row following the + # row with the pre-existing NA value. In this case, the ffilled value should + # be an NA value (rather than looking to previous rows for a non-NA value). + # To support this, we would need to have `ignore_nulls=False`, but we would + # also need to qualify the window for values that were in the original DataFrame + # as otherwise, if we have multiple new index values that have NA values, we would + # pick these NA values instead of finding a non-NA value from the original DataFrame. native_df = native_pd.DataFrame( [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ABC") ) From 0f7a9e4d646b71b1b6ae95c01f581df6e0230832 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 10 Jul 2024 21:40:31 -0700 Subject: [PATCH 09/20] Add reindex with axis=1 --- .../compiler/snowflake_query_compiler.py | 163 ++++++++++++++++-- 1 file changed, 148 insertions(+), 15 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index eb47f8aa2ed..d51f0c2a75e 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2344,7 +2344,73 @@ def reindex( ) return new_qc - return self + else: + method = kwargs.get("method", None) + level = kwargs.get("level", None) + limit = kwargs.get("limit", None) + tolerance = kwargs.get("tolerance", None) + fill_value = kwargs.get("fill_value", None) + if not self.columns.is_lazy: + self.columns.to_pandas().reindex( + labels, method, level, limit, tolerance + ) + data_column_pandas_labels = [] + data_column_snowflake_quoted_identifiers = [] + modin_frame = self._modin_frame + for label in labels: + data_column_pandas_labels += [label] + if label in self._modin_frame.data_column_pandas_labels: + snowflake_quoted_ids = list( + modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( + [label], include_index=False + )[ + 0 + ] + ) + data_column_snowflake_quoted_identifiers += snowflake_quoted_ids + if len(snowflake_quoted_ids) > 1: + data_column_pandas_labels += [label] * ( + len(snowflake_quoted_ids) - 1 + ) + else: + modin_frame = modin_frame.append_column(label, pandas_lit(np.nan)) + data_column_snowflake_quoted_identifiers += [ + modin_frame.data_column_snowflake_quoted_identifiers[-1] + ] + new_modin_frame = InternalFrame.create( + ordered_dataframe=modin_frame.ordered_dataframe, + data_column_pandas_labels=data_column_pandas_labels, + data_column_snowflake_quoted_identifiers=data_column_snowflake_quoted_identifiers, + data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=self._modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=self._modin_frame.index_column_snowflake_quoted_identifiers, + ) + new_qc = SnowflakeQueryCompiler(new_modin_frame) + ordered_columns = sorted(data_column_pandas_labels) + columns_to_ignore = [ + c in self._modin_frame.data_column_pandas_labels + for c in ordered_columns + ] + if method not in ["nearest", None]: + new_qc = new_qc.take_2d_labels( + index=slice(None), columns=ordered_columns + ).fillna( + method=method, limit=limit, _columns_to_ignore=columns_to_ignore, self_is_series=False # type: ignore[arg-type] + ) + if fill_value is not np.nan: + new_qc = new_qc.take_2d_labels( + index=slice(None), columns=ordered_columns + ).fillna( + value=fill_value, + _columns_to_ignore=columns_to_ignore, + self_is_series=False, + ) + if method not in ["nearest", None] or fill_value is not np.nan: + # We only need to reorder the columns if we sorted them above for filling. + new_qc = new_qc.take_2d_labels( + index=slice(None), columns=data_column_pandas_labels + ) + return new_qc def _parse_names_arguments_from_reset_index( self, @@ -8957,6 +9023,7 @@ def _make_fill_expression_for_column_wise_fillna( snowflake_quoted_identifier: str, method: FillNAMethod, limit: Optional[int] = None, + _columns_to_ignore: Optional[list[bool]] = None, ) -> SnowparkColumn: """ Helper function to get the Snowpark Column expression corresponding to snowflake_quoted_id when doing a column wise fillna. @@ -8982,6 +9049,7 @@ def _make_fill_expression_for_column_wise_fillna( col_pos = self._modin_frame.data_column_snowflake_quoted_identifiers.index( snowflake_quoted_identifier ) + # If we are looking at the first column and doing an ffill, or looking at the last column and doing a bfill, # there are no other columns for us to coalesce with, so returning coalesce will error since it will be a # coalesce with one column. Instead, we just return the column. @@ -8989,25 +9057,72 @@ def _make_fill_expression_for_column_wise_fillna( col_pos == len_ids - 1 and not method_is_ffill ): return col(snowflake_quoted_identifier) + if method_is_ffill: start_pos = 0 if limit is not None: start_pos = max(col_pos - limit, start_pos) + columns_to_include = ( + self._modin_frame.data_column_snowflake_quoted_identifiers[ + start_pos:col_pos + ][::-1] + ) + if _columns_to_ignore: + # When _colums_to_ignore is set, we are using this to perform a column-wise fill for reindex. + # In that case, we will do two things: + # 1. We must filter so that the only columns that appear in the coalesce are columns that + # were previously a part of the dataframe (filter using the booleans in _columns_to_ignore). + # 2. We must propagate NA values from existing columns (so if we call ffill, and we are filling + # new column 'C', and old column 'A' has value 4 and old column 'B' has value NaN, we must fill + # column 'C' with NaN, unlike with standard fillna, where we would propagate 4.) + # + # Now, columns_to_include includes the columns that we can use to fill the value in this column; + # however, since we want to propagate NA values, we can find the first column in the list that is + # an old column, and only pass that in to coalesce, so that if there is a NaN, it will be propagated, + # and if there isn't, the correct value will be propagated. + column = None + for col_name, ignore_bool in zip( + columns_to_include, _columns_to_ignore[start_pos:col_pos][::-1] + ): + if ignore_bool: + # This means that this is a column from the original data. + column = col_name + break + if column is None: + columns_to_include = columns_to_include[0:1] + else: + columns_to_include = [column] return coalesce( snowflake_quoted_identifier, - *self._modin_frame.data_column_snowflake_quoted_identifiers[ - start_pos:col_pos - ][::-1], + *columns_to_include, ) else: - start_pos = len_ids + end_pos = len_ids if limit is not None: - start_pos = min(col_pos + limit, len_ids) + # Add 1 since end index is exclusive. + end_pos = min(col_pos + limit + 1, len_ids) + columns_to_include = ( + self._modin_frame.data_column_snowflake_quoted_identifiers[ + col_pos:end_pos + ] + ) + if _columns_to_ignore: + column = None + for col_name, ignore_bool in zip( + columns_to_include, _columns_to_ignore[col_pos:end_pos] + ): + if ignore_bool: + # This means that this is a column from the original data. + column = col_name + break + if column is None: + columns_to_include = columns_to_include[0:1] + else: + columns_to_include = [column] + return coalesce( snowflake_quoted_identifier, - *self._modin_frame.data_column_snowflake_quoted_identifiers[ - start_pos:col_pos:-1 - ][::-1], + *columns_to_include, ) def fillna( @@ -9020,6 +9135,7 @@ def fillna( limit: Optional[int] = None, downcast: Optional[dict] = None, _filter_column_snowflake_quoted_id: Optional[str] = None, + _columns_to_ignore: Optional[list[bool]] = None, ) -> "SnowflakeQueryCompiler": """ Replace NaN values using provided method. @@ -9032,6 +9148,7 @@ def fillna( limit : int, optional downcast : dict, optional _filter_column_snowflake_quoted_id : str, optional + _columns_to_ignore : list[bool], optional **kwargs : dict Serves the compatibility purpose. Does not affect the result. @@ -9108,12 +9225,28 @@ def fillna_expr(snowflake_quoted_id: str) -> SnowparkColumn: for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers } else: - fillna_column_map = { - snowflake_quoted_id: self._make_fill_expression_for_column_wise_fillna( - snowflake_quoted_id, method, limit=limit - ) - for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers - } + if _columns_to_ignore is None: + fillna_column_map = { + snowflake_quoted_id: self._make_fill_expression_for_column_wise_fillna( + snowflake_quoted_id, + method, + limit=limit, + ) + for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers + } + else: + fillna_column_map = { + snowflake_quoted_id: self._make_fill_expression_for_column_wise_fillna( + snowflake_quoted_id, + method, + limit=limit, + _columns_to_ignore=_columns_to_ignore, + ) + for i, snowflake_quoted_id in enumerate( + self._modin_frame.data_column_snowflake_quoted_identifiers + ) + if not _columns_to_ignore[i] + } # case 3: fillna with a mapping else: # we create a mapping from column label to the fillin value and use coalesce to implement fillna From 2134be3369d458c80353779b0a77b919e504a4aa Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 10 Jul 2024 21:42:25 -0700 Subject: [PATCH 10/20] Update changelog --- CHANGELOG.md | 1 + docs/source/modin/supported/dataframe_supported.rst | 4 ++-- docs/source/modin/supported/series_supported.rst | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 896b6ea5728..074af3b2270 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ - Added partial support for `Series.str.translate` where the values in the `table` are single-codepoint strings. - Added support for `DataFrame.corr`. - Added support for `limit` parameter when `method` parameter is used in `fillna`. +- Added support for `DataFrame.reindex` and `Series.reindex`. #### Bug Fixes - Fixed an issue when using np.where and df.where when the scalar 'other' is the literal 0. diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst index 7d3bc1ad8d5..0e131955c97 100644 --- a/docs/source/modin/supported/dataframe_supported.rst +++ b/docs/source/modin/supported/dataframe_supported.rst @@ -173,7 +173,7 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``explode`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``ffill`` | P | | ``N`` if parameter ``downcast`` is set. ``limit`` | +| ``ffill`` | P | ``downcast`` | ``N`` if parameter ``downcast`` is set. ``limit`` | | | | | parameter only supported if ``method`` parameter | | | | | is used. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ @@ -330,7 +330,7 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``rdiv`` | P | ``level`` | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``reindex`` | N | | | +| ``reindex`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``reindex_like`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/docs/source/modin/supported/series_supported.rst b/docs/source/modin/supported/series_supported.rst index 78fc92b1994..83ae744ddd6 100644 --- a/docs/source/modin/supported/series_supported.rst +++ b/docs/source/modin/supported/series_supported.rst @@ -186,7 +186,7 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``factorize`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``ffill`` | P | | ``N`` if parameter ``downcast`` is set. ``limit`` | +| ``ffill`` | P | ``downcast`` | ``N`` if parameter ``downcast`` is set. ``limit`` | | | | | parameter only supported if ``method`` parameter | | | | | is used. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ @@ -322,7 +322,7 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``rdivmod`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``reindex`` | N | | | +| ``reindex`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``reindex_like`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ From 4558143f4c6a2f15e58c2d70f5cff38dfb235185 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 10 Jul 2024 22:11:28 -0700 Subject: [PATCH 11/20] Add testing for axis=1 --- .../compiler/snowflake_query_compiler.py | 28 +- tests/integ/modin/frame/test_reindex.py | 637 +++++++++++------- 2 files changed, 407 insertions(+), 258 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index d51f0c2a75e..b56dffbf6b1 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -9282,17 +9282,25 @@ def fillna_expr(snowflake_quoted_id: str) -> SnowparkColumn: include_index=False, ) fillna_column_map = {} + if _columns_to_ignore is not None: + columns_to_ignore = itertools.compress( + self._modin_frame.data_column_pandas_labels, + _columns_to_ignore, + ) + else: + columns_to_ignore = [] # type:ignore [assignment] for label, id_tuple in zip(labels, id_tuples): - for id in id_tuple: - val = label_to_value_map[label] - if _filter_column_snowflake_quoted_id is None: - fillna_column_map[id] = coalesce(id, pandas_lit(val)) - else: - fillna_column_map[id] = iff( - col(_filter_column_snowflake_quoted_id), - col(id), - coalesce(id, pandas_lit(val)), - ) + if label not in columns_to_ignore: + for id in id_tuple: + val = label_to_value_map[label] + if _filter_column_snowflake_quoted_id is None: + fillna_column_map[id] = coalesce(id, pandas_lit(val)) + else: + fillna_column_map[id] = iff( + col(_filter_column_snowflake_quoted_id), + col(id), + coalesce(id, pandas_lit(val)), + ) return SnowflakeQueryCompiler( self._modin_frame.update_snowflake_quoted_identifiers_with_expressions( diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index c11d656f686..dc4157bb223 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -9,258 +9,399 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import eval_snowpark_pandas_result -@sql_count_checker(query_count=1, join_count=1) -def test_reindex_index_basic_reorder(): - native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) - snow_df = pd.DataFrame(native_df) - eval_snowpark_pandas_result( - snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("CAB")) - ) - - -@sql_count_checker(query_count=1, join_count=1) -def test_reindex_index_basic_add_elements(): - native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) - snow_df = pd.DataFrame(native_df) - eval_snowpark_pandas_result( - snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("CABDEF")) - ) - - -@sql_count_checker(query_count=1, join_count=1) -def test_reindex_index_basic_remove_elements(): - native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) - snow_df = pd.DataFrame(native_df) - eval_snowpark_pandas_result( - snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("CA")) - ) - - -@sql_count_checker(query_count=1, join_count=1) -def test_reindex_index_basic_add_remove_elements(): - native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) - snow_df = pd.DataFrame(native_df) - eval_snowpark_pandas_result( - snow_df, native_df, lambda df: df.reindex(index=list("CADEFG")) - ) - - -@sql_count_checker(query_count=1, join_count=1) -def test_reindex_index_fill_value(): - native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) - snow_df = pd.DataFrame(native_df) - eval_snowpark_pandas_result( - snow_df, native_df, lambda df: df.reindex(index=list("CADEFG"), fill_value=-1.0) - ) - - -@sql_count_checker(query_count=1, join_count=1) -@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) -@pytest.mark.parametrize("limit", [None, 1, 2, 100]) -def test_reindex_index_fill_method(method, limit): - native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ACE")) - snow_df = pd.DataFrame(native_df) - - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda df: df.reindex(index=list("ABCDEFGH"), method=method, limit=limit), - ) - - -@sql_count_checker(query_count=1, join_count=1) -@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) -@pytest.mark.parametrize("limit", [None, 1, 2, 100]) -def test_reindex_index_fill_method_pandas_negative(method, limit): - # The negative is because pandas does not support `limit` parameter - # if the index and target are not both monotonic, while we do. - native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) - snow_df = pd.DataFrame(native_df) - if limit is not None: - method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) - with pytest.raises( - ValueError, - match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", - ): - native_df.reindex(index=list("CEBFGA"), method=method, limit=limit) +class TestReindexAxis0: + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_basic_reorder(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("CAB")) + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_basic_add_elements(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("CABDEF")) + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_basic_remove_elements(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("CA")) + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_basic_add_remove_elements(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(index=list("CADEFG")) + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_fill_value(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(index=list("CADEFG"), fill_value=-1.0), + ) + + @sql_count_checker(query_count=1, join_count=1) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + def test_reindex_index_fill_method(self, method, limit): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ACE")) + snow_df = pd.DataFrame(native_df) + + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(index=list("ABCDEFGH"), method=method, limit=limit), + ) + + @sql_count_checker(query_count=1, join_count=1) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + def test_reindex_index_fill_method_pandas_negative(self, method, limit): + # The negative is because pandas does not support `limit` parameter + # if the index and target are not both monotonic, while we do. + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + with pytest.raises( + ValueError, + match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", + ): + native_df.reindex(index=list("CEBFGA"), method=method, limit=limit) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex(index=list("CADEFG"), method=method, limit=limit) + else: + return df.reindex( + index=list("ACDEFG"), method=method, limit=limit + ).reindex(list("CADEFG")) + + eval_snowpark_pandas_result( + snow_df, + native_df, + perform_reindex, + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_ordered_index_unordered_new_index(self): + ordered_native_dataframe = native_pd.DataFrame( + [[5] * 3, [6] * 3, [8] * 3], columns=list("ABC"), index=[5, 6, 8] + ) + ordered_snow_dataframe = pd.DataFrame(ordered_native_dataframe) + eval_snowpark_pandas_result( + ordered_snow_dataframe, + ordered_native_dataframe, + lambda df: df.reindex(index=[6, 8, 7], method="ffill"), + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_fill_value_with_old_na_values(self): + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(index=list("CEBFGA"), fill_value=-1), + ) + + @sql_count_checker(query_count=1, join_count=1) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + @pytest.mark.xfail(reason="Cannot qualify window in fillna.") + def test_reindex_index_fill_method_with_old_na_values(self, limit, method): + # Say there are NA values in the data before reindex. reindex is called with + # ffill as the method, and there is a new NA value in the row following the + # row with the pre-existing NA value. In this case, the ffilled value should + # be an NA value (rather than looking to previous rows for a non-NA value). + # To support this, we would need to have `ignore_nulls=False`, but we would + # also need to qualify the window for values that were in the original DataFrame + # as otherwise, if we have multiple new index values that have NA values, we would + # pick these NA values instead of finding a non-NA value from the original DataFrame. + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ACE") + ) + snow_df = pd.DataFrame(native_df) + + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(index=list("ABCDEFGH"), method=method, limit=limit), + ) + + @sql_count_checker(query_count=1, join_count=1) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + @pytest.mark.xfail(reason="Cannot qualify window in fillna.") + def test_reindex_index_fill_method_with_old_na_values_pandas_negative( + self, limit, method + ): + # Say there are NA values in the data before reindex. reindex is called with + # ffill as the method, and there is a new NA value in the row following the + # row with the pre-existing NA value. In this case, the ffilled value should + # be an NA value (rather than looking to previous rows for a non-NA value). + # To support this, we would need to have `ignore_nulls=False`, but we would + # also need to qualify the window for values that were in the original DataFrame + # as otherwise, if we have multiple new index values that have NA values, we would + # pick these NA values instead of finding a non-NA value from the original DataFrame. + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + with pytest.raises( + ValueError, + match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", + ): + native_df.reindex(index=list("CEBFGA"), method=method, limit=limit) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex(index=list("CEBFGA"), method=method, limit=limit) + else: + return df.reindex( + index=list("ABCEFG"), method=method, limit=limit + ).reindex(list("CEBFGA")) + + eval_snowpark_pandas_result( + snow_df, + native_df, + perform_reindex, + ) + + @sql_count_checker(query_count=2, join_count=1) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + def test_reindex_index_datetime_with_fill(self, limit, method): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex( + pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + else: + return df.reindex( + native_pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + + eval_snowpark_pandas_result( + snow_df, native_df, perform_reindex, check_freq=False + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_reindex_index_non_overlapping_index(self): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("EFG")) + ) + + @sql_count_checker(query_count=2, join_count=1) + def test_reindex_index_non_overlapping_datetime_index(self): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex( + pd.date_range("12/29/2008", periods=10, freq="D"), + ) + else: + return df.reindex( + native_pd.date_range("12/29/2008", periods=10, freq="D"), + ) + + eval_snowpark_pandas_result( + snow_df, native_df, perform_reindex, check_freq=False + ) + + @sql_count_checker(query_count=1) + def test_reindex_index_non_overlapping_different_types_index_negative(self): + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) - def perform_reindex(df): - if isinstance(df, pd.DataFrame): - return df.reindex(index=list("CADEFG"), method=method, limit=limit) - else: - return df.reindex(index=list("ACDEFG"), method=method, limit=limit).reindex( - list("CADEFG") - ) - - eval_snowpark_pandas_result( - snow_df, - native_df, - perform_reindex, - ) - - -@sql_count_checker(query_count=1, join_count=1) -def test_reindex_index_ordered_index_unordered_new_index(): - ordered_native_dataframe = native_pd.DataFrame( - [[5] * 3, [6] * 3, [8] * 3], columns=list("ABC"), index=[5, 6, 8] - ) - ordered_snow_dataframe = pd.DataFrame(ordered_native_dataframe) - eval_snowpark_pandas_result( - ordered_snow_dataframe, - ordered_native_dataframe, - lambda df: df.reindex(index=[6, 8, 7], method="ffill"), - ) - - -@sql_count_checker(query_count=1, join_count=1) -def test_reindex_index_fill_value_with_old_na_values(): - native_df = native_pd.DataFrame( - [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ABC") - ) - snow_df = pd.DataFrame(native_df) - eval_snowpark_pandas_result( - snow_df, native_df, lambda df: df.reindex(index=list("CEBFGA"), fill_value=-1) - ) - - -@sql_count_checker(query_count=1, join_count=1) -@pytest.mark.parametrize("limit", [None, 1, 2, 100]) -@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) -@pytest.mark.xfail(reason="Cannot qualify window in fillna.") -def test_reindex_index_fill_method_with_old_na_values(limit, method): - # Say there are NA values in the data before reindex. reindex is called with - # ffill as the method, and there is a new NA value in the row following the - # row with the pre-existing NA value. In this case, the ffilled value should - # be an NA value (rather than looking to previous rows for a non-NA value). - # To support this, we would need to have `ignore_nulls=False`, but we would - # also need to qualify the window for values that were in the original DataFrame - # as otherwise, if we have multiple new index values that have NA values, we would - # pick these NA values instead of finding a non-NA value from the original DataFrame. - native_df = native_pd.DataFrame( - [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ACE") - ) - snow_df = pd.DataFrame(native_df) - - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda df: df.reindex(index=list("ABCDEFGH"), method=method, limit=limit), - ) - - -@sql_count_checker(query_count=1, join_count=1) -@pytest.mark.parametrize("limit", [None, 1, 2, 100]) -@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) -@pytest.mark.xfail(reason="Cannot qualify window in fillna.") -def test_reindex_index_fill_method_with_old_na_values_pandas_negative(limit, method): - # Say there are NA values in the data before reindex. reindex is called with - # ffill as the method, and there is a new NA value in the row following the - # row with the pre-existing NA value. In this case, the ffilled value should - # be an NA value (rather than looking to previous rows for a non-NA value). - # To support this, we would need to have `ignore_nulls=False`, but we would - # also need to qualify the window for values that were in the original DataFrame - # as otherwise, if we have multiple new index values that have NA values, we would - # pick these NA values instead of finding a non-NA value from the original DataFrame. - native_df = native_pd.DataFrame( - [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("ABC") - ) - snow_df = pd.DataFrame(native_df) - if limit is not None: - method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) with pytest.raises( - ValueError, - match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", + SnowparkSQLException, match=".*Timestamp 'A' is not recognized" ): - native_df.reindex(index=list("CEBFGA"), method=method, limit=limit) - - def perform_reindex(df): - if isinstance(df, pd.DataFrame): - return df.reindex(index=list("CEBFGA"), method=method, limit=limit) - else: - return df.reindex(index=list("ABCEFG"), method=method, limit=limit).reindex( - list("CEBFGA") - ) - - eval_snowpark_pandas_result( - snow_df, - native_df, - perform_reindex, - ) - - -@sql_count_checker(query_count=2, join_count=1) -@pytest.mark.parametrize("limit", [None, 1, 2, 100]) -@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) -def test_reindex_index_datetime_with_fill(limit, method): - date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") - native_df = native_pd.DataFrame( - {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index - ) - date_index = pd.date_range("1/1/2010", periods=6, freq="D") - snow_df = pd.DataFrame( - {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index - ) - - def perform_reindex(df): - if isinstance(df, pd.DataFrame): - return df.reindex( - pd.date_range("12/29/2009", periods=10, freq="D"), - method=method, - limit=limit, - ) + snow_df.reindex(list("ABC")).to_pandas() + + +class TestReindexAxis1: + @sql_count_checker(query_count=1) + def test_reindex_columns_basic_reorder(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=1, labels=list("CAB")) + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_basic_add_elements(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=1, labels=list("CABDEF")) + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_basic_remove_elements(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=1, labels=list("CA")) + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_basic_add_remove_elements(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(columns=list("CADEFG")) + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_fill_value(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=list("CADEFG"), fill_value=-1.0), + ) + + @sql_count_checker(query_count=1) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + def test_reindex_columns_fill_method(self, method, limit): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ACE") + ) + snow_df = pd.DataFrame(native_df) + + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=list("ABCDEFGH"), method=method, limit=limit), + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_ordered_columns_unordered_new_columns(self): + ordered_native_dataframe = native_pd.DataFrame( + np.array([[5] * 3, [6] * 3, [8] * 3]).T, + index=list("ABC"), + columns=[5, 6, 8], + ) + ordered_snow_dataframe = pd.DataFrame(ordered_native_dataframe) + eval_snowpark_pandas_result( + ordered_snow_dataframe, + ordered_native_dataframe, + lambda df: df.reindex(columns=[6, 8, 7], method="ffill"), + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_fill_value_with_old_na_values(self): + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=list("CEBFGA"), fill_value=-1), + ) + + @sql_count_checker(query_count=1) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + def test_reindex_columns_fill_method_with_old_na_values(self, limit, method): + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=list("ACE") + ) + snow_df = pd.DataFrame(native_df) + + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=list("ABCDEFGH"), method=method, limit=limit), + ) + + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + def test_reindex_columns_fill_method_with_old_na_values_negative( + self, limit, method + ): + native_df = native_pd.DataFrame( + [[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + match_str = f"limit argument for '{method_str}' method only well-defined if index and target are monotonic" + with SqlCounter(query_count=0): + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex( + columns=list("CEBFGA"), method=method, limit=limit + ), + expect_exception=True, + assert_exception_equal=True, + expect_exception_match=match_str, + expect_exception_type=ValueError, + ) else: - return df.reindex( - native_pd.date_range("12/29/2009", periods=10, freq="D"), - method=method, - limit=limit, - ) - - eval_snowpark_pandas_result(snow_df, native_df, perform_reindex, check_freq=False) - - -@sql_count_checker(query_count=1, join_count=1) -def test_reindex_index_non_overlapping_index(): - native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABC")) - snow_df = pd.DataFrame(native_df) - eval_snowpark_pandas_result( - snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("EFG")) - ) - - -@sql_count_checker(query_count=2, join_count=1) -def test_reindex_index_non_overlapping_datetime_index(): - date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") - native_df = native_pd.DataFrame( - {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index - ) - date_index = pd.date_range("1/1/2010", periods=6, freq="D") - snow_df = pd.DataFrame( - {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index - ) - - def perform_reindex(df): - if isinstance(df, pd.DataFrame): - return df.reindex( - pd.date_range("12/29/2008", periods=10, freq="D"), - ) - else: - return df.reindex( - native_pd.date_range("12/29/2008", periods=10, freq="D"), - ) - - eval_snowpark_pandas_result(snow_df, native_df, perform_reindex, check_freq=False) - - -@sql_count_checker(query_count=1) -def test_reindex_index_non_overlapping_different_types_index_negative(): - date_index = pd.date_range("1/1/2010", periods=6, freq="D") - snow_df = pd.DataFrame( - {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index - ) - - with pytest.raises(SnowparkSQLException, match=".*Timestamp 'A' is not recognized"): - snow_df.reindex(list("ABC")).to_pandas() + with SqlCounter(query_count=1): + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=list("CEBFGA"), method=method), + ) From 1866efdfe529bf0e87df76ea86a7cc5102c47d93 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 10 Jul 2024 22:21:37 -0700 Subject: [PATCH 12/20] Add more extensive testing for reindex on columns --- .../compiler/snowflake_query_compiler.py | 4 +- tests/integ/modin/frame/test_reindex.py | 81 +++++++++++++++++++ 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index b56dffbf6b1..3fa16712aa3 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2210,7 +2210,7 @@ def reindex( new_index_modin_frame = new_index_qc._modin_frame modin_frame = self._modin_frame method = kwargs.get("method", None) - fill_value = kwargs.get("fill_value", None) + fill_value = kwargs.get("fill_value", np.nan) # type: ignore[arg-type] _filter_column_snowflake_quoted_id = None if fill_value is not np.nan or method: # If we are filling values, reindex ignores NaN values that @@ -2349,7 +2349,7 @@ def reindex( level = kwargs.get("level", None) limit = kwargs.get("limit", None) tolerance = kwargs.get("tolerance", None) - fill_value = kwargs.get("fill_value", None) + fill_value = kwargs.get("fill_value", np.nan) # type: ignore[arg-type] if not self.columns.is_lazy: self.columns.to_pandas().reindex( labels, method, level, limit, tolerance diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index dc4157bb223..134941f5165 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -405,3 +405,84 @@ def test_reindex_columns_fill_method_with_old_na_values_negative( native_df, lambda df: df.reindex(columns=list("CEBFGA"), method=method), ) + + @sql_count_checker(query_count=5) + @pytest.mark.parametrize("limit", [None, 1, 2, 100]) + @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) + def test_reindex_columns_datetime_with_fill(self, limit, method): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex( + columns=pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + else: + return df.reindex( + columns=native_pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + + eval_snowpark_pandas_result( + snow_df, native_df, perform_reindex, check_freq=False + ) + + @sql_count_checker(query_count=1) + def test_reindex_columns_non_overlapping_columns(self): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(axis=1, labels=list("EFG")) + ) + + @sql_count_checker(query_count=5) + def test_reindex_columns_non_overlapping_datetime_columns(self): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + + def perform_reindex(df): + if isinstance(df, pd.DataFrame): + return df.reindex( + columns=pd.date_range("12/29/2008", periods=10, freq="D"), + ) + else: + return df.reindex( + columns=native_pd.date_range("12/29/2008", periods=10, freq="D"), + ) + + eval_snowpark_pandas_result( + snow_df, native_df, perform_reindex, check_freq=False + ) + + @sql_count_checker(query_count=2) + def test_reindex_columns_non_overlapping_different_types_columns(self): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_df = native_pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_df = pd.DataFrame( + [[100, 101, np.nan, 100, 89, 88]], index=["prices"], columns=date_index + ) + + eval_snowpark_pandas_result( + snow_df, native_df, lambda df: df.reindex(columns=list("ABCD")) + ) From 5747df35e48e4210e62d6d9d4ed330729e5db604 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Thu, 11 Jul 2024 17:54:57 -0700 Subject: [PATCH 13/20] Add testing for series.reindex, address review comments --- .../modin/supported/dataframe_supported.rst | 4 +- .../modin/supported/series_supported.rst | 4 +- src/snowflake/snowpark/modin/pandas/base.py | 5 +- src/snowflake/snowpark/modin/pandas/series.py | 11 +- .../compiler/snowflake_query_compiler.py | 486 +++++++++++------- tests/integ/modin/frame/test_reindex.py | 17 + tests/integ/modin/series/test_reindex.py | 335 ++++++++++++ 7 files changed, 655 insertions(+), 207 deletions(-) create mode 100644 tests/integ/modin/series/test_reindex.py diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst index 0e131955c97..762e825bafd 100644 --- a/docs/source/modin/supported/dataframe_supported.rst +++ b/docs/source/modin/supported/dataframe_supported.rst @@ -173,7 +173,7 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``explode`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``ffill`` | P | ``downcast`` | ``N`` if parameter ``downcast`` is set. ``limit`` | +| ``ffill`` | P | | ``N`` if parameter ``downcast`` is set. ``limit`` | | | | | parameter only supported if ``method`` parameter | | | | | is used. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ @@ -330,7 +330,7 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``rdiv`` | P | ``level`` | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``reindex`` | Y | | | +| ``reindex`` | P | | ``N`` if axis is MultiIndex. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``reindex_like`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/docs/source/modin/supported/series_supported.rst b/docs/source/modin/supported/series_supported.rst index 83ae744ddd6..71c9de2e2b2 100644 --- a/docs/source/modin/supported/series_supported.rst +++ b/docs/source/modin/supported/series_supported.rst @@ -186,7 +186,7 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``factorize`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``ffill`` | P | ``downcast`` | ``N`` if parameter ``downcast`` is set. ``limit`` | +| ``ffill`` | P | | ``N`` if parameter ``downcast`` is set. ``limit`` | | | | | parameter only supported if ``method`` parameter | | | | | is used. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ @@ -322,7 +322,7 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``rdivmod`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``reindex`` | Y | | | +| ``reindex`` | P | | ``N`` if the series has MultiIndex | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``reindex_like`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/pandas/base.py b/src/snowflake/snowpark/modin/pandas/base.py index 8d06ecb5993..a06ea2d8ede 100644 --- a/src/snowflake/snowpark/modin/pandas/base.py +++ b/src/snowflake/snowpark/modin/pandas/base.py @@ -2595,7 +2595,10 @@ def reindex( Conform `BasePandasDataset` to new index with optional filling logic. """ # TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset - + if kwargs.get("limit", None) is not None and kwargs.get("method", None) is None: + raise ValueError( + "limit argument only valid if doing pad, backfill or nearest reindexing" + ) new_query_compiler = None if index is not None: if not isinstance(index, pandas.Index) or not index.equals(self.index): diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index e7fd0aa8fce..b12ae2124be 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -1658,10 +1658,12 @@ def ravel(self, order="C"): # noqa: PR01, RT01, D200 def reindex(self, *args, **kwargs): if args: if len(args) > 1: - raise TypeError("Only one positional argument ('index') is allowed") + raise TypeError( + "Series.reindex() takes from 1 to 2 positional arguments but 3 were given" + ) if "index" in kwargs: raise TypeError( - "'index' passed as both positional and keyword argument" + "Series.reindex() got multiple values for argument 'index'" ) kwargs.update({"index": args[0]}) index = kwargs.pop("index", None) @@ -1671,10 +1673,11 @@ def reindex(self, *args, **kwargs): limit = kwargs.pop("limit", None) tolerance = kwargs.pop("tolerance", None) fill_value = kwargs.pop("fill_value", None) + kwargs.pop("axis", None) if kwargs: raise TypeError( - "reindex() got an unexpected keyword " - + f'argument "{list(kwargs.keys())[0]}"' + "Series.reindex() got an unexpected keyword " + + f"argument '{list(kwargs.keys())[0]}'" ) return super().reindex( index=index, diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 3fa16712aa3..51b3b1e271b 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2202,111 +2202,169 @@ def reindex( Returns ------- - BaseQueryCompiler + SnowflakeQueryCompiler QueryCompiler with aligned axis. """ + if self.is_multiindex(axis=axis): + raise NotImplementedError( + "Snowpark pandas doesn't support `reindex` with MultiIndex" + ) if axis == 0: - new_index_qc = pd.Series(labels)._query_compiler - new_index_modin_frame = new_index_qc._modin_frame - modin_frame = self._modin_frame - method = kwargs.get("method", None) - fill_value = kwargs.get("fill_value", np.nan) # type: ignore[arg-type] - _filter_column_snowflake_quoted_id = None - if fill_value is not np.nan or method: - # If we are filling values, reindex ignores NaN values that - # were previously present in the DataFrame before reindexing. - # In order to differentiate between pre-existing NaN values, - # and new NaN values caused by new index values that are not - # present, we can attach a boolean column of all `True`'s to - # self's modin_frame. After the left join with the new index - # rows that were present in self will have a True value, while - # rows that were not present in self will have a NA value. We can - # filter by which rows have an NA value for the dummy column to determine - # between pre-existing NaN's, and NaN's that were introduced because of new - # values in the index that are not present in the old index. If a row - # has a True value for the dummy column, any NaN's in it should be ignored - # as it is a pre-existing NaN value that we **should not** fill. - modin_frame = modin_frame.append_column( - "dummy_reindex_column_for_fill", pandas_lit(True) - ) - _filter_column_snowflake_quoted_id = ( - modin_frame.data_column_snowflake_quoted_identifiers[-1] - ) - result_frame, result_frame_column_mapper = join_utils.join( - new_index_modin_frame, - modin_frame, - how="left", - left_on=new_index_modin_frame.data_column_snowflake_quoted_identifiers, - right_on=modin_frame.index_column_snowflake_quoted_identifiers, + return self._reindex_axis_0(labels=labels, **kwargs) + else: + return self._reindex_axis_1(labels=labels, **kwargs) + + def _reindex_axis_0( + self, + labels: Union[pandas.Index, list[Any]], + **kwargs: dict[str, Any], + ) -> "SnowflakeQueryCompiler": + """ + Align QueryCompiler data with a new index. + + Parameters + ---------- + labels : list-like + Index-labels to align with. + method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"} + Method to use for filling holes in reindexed frame. + fill_value : scalar + Value to use for missing values in the resulted frame. + limit : int + tolerance : int + **kwargs : dict + Serves the compatibility purpose. Does not affect the result. + + Returns + ------- + SnowflakeQueryCompiler + QueryCompiler with aligned axis. + """ + new_index_qc = pd.Series(labels)._query_compiler + new_index_modin_frame = new_index_qc._modin_frame + modin_frame = self._modin_frame + method = kwargs.get("method", None) + fill_value = kwargs.get("fill_value", np.nan) # type: ignore[arg-type] + _filter_column_snowflake_quoted_id = None + if fill_value is not np.nan or method: + # If we are filling values, reindex ignores NaN values that + # were previously present in the DataFrame before reindexing. + # In order to differentiate between pre-existing NaN values, + # and new NaN values caused by new index values that are not + # present, we can attach a boolean column of all `True`'s to + # self's modin_frame. After the left join with the new index + # rows that were present in self will have a True value, while + # rows that were not present in self will have a NA value. We can + # filter by which rows have an NA value for the dummy column to determine + # between pre-existing NaN's, and NaN's that were introduced because of new + # values in the index that are not present in the old index. If a row + # has a True value for the dummy column, any NaN's in it should be ignored + # as it is a pre-existing NaN value that we **should not** fill. + modin_frame = modin_frame.append_column( + "dummy_reindex_column_for_fill", pandas_lit(True) + ) + _filter_column_snowflake_quoted_id = ( + modin_frame.data_column_snowflake_quoted_identifiers[-1] + ) + result_frame, result_frame_column_mapper = join_utils.join( + new_index_modin_frame, + modin_frame, + how="left", + left_on=new_index_modin_frame.data_column_snowflake_quoted_identifiers, + right_on=modin_frame.index_column_snowflake_quoted_identifiers, + ) + data_column_pandas_labels = modin_frame.data_column_pandas_labels + data_column_snowflake_quoted_identifiers = ( + result_frame_column_mapper.map_right_quoted_identifiers( + modin_frame.data_column_snowflake_quoted_identifiers ) - data_column_pandas_labels = modin_frame.data_column_pandas_labels - data_column_snowflake_quoted_identifiers = ( + ) + new_modin_frame = InternalFrame.create( + ordered_dataframe=result_frame.ordered_dataframe, + data_column_pandas_labels=data_column_pandas_labels, + data_column_snowflake_quoted_identifiers=data_column_snowflake_quoted_identifiers, + data_column_pandas_index_names=modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_left_quoted_identifiers( + new_index_modin_frame.data_column_snowflake_quoted_identifiers + ), + ) + new_qc = SnowflakeQueryCompiler(new_modin_frame) + if method or fill_value is not np.nan: + limit = kwargs.get("limit", None) + new_filter_column_snowflake_quoted_id = ( result_frame_column_mapper.map_right_quoted_identifiers( - modin_frame.data_column_snowflake_quoted_identifiers - ) + [_filter_column_snowflake_quoted_id] + )[0] ) - new_modin_frame = InternalFrame.create( - ordered_dataframe=result_frame.ordered_dataframe, - data_column_pandas_labels=data_column_pandas_labels, - data_column_snowflake_quoted_identifiers=data_column_snowflake_quoted_identifiers, - data_column_pandas_index_names=modin_frame.data_column_pandas_index_names, - index_column_pandas_labels=modin_frame.index_column_pandas_labels, - index_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_left_quoted_identifiers( - new_index_modin_frame.data_column_snowflake_quoted_identifiers - ), + ( + new_modin_frame, + mapper, + ) = new_modin_frame.update_snowflake_quoted_identifiers_with_expressions( + { + new_filter_column_snowflake_quoted_id: coalesce( + col(new_filter_column_snowflake_quoted_id), + pandas_lit(False), + ) + } ) - new_qc = SnowflakeQueryCompiler(new_modin_frame) - if method or fill_value is not np.nan: - limit = kwargs.get("limit", None) - new_filter_column_snowflake_quoted_id = ( - result_frame_column_mapper.map_right_quoted_identifiers( - [_filter_column_snowflake_quoted_id] - )[0] + new_filter_column_snowflake_quoted_id = mapper[ + new_filter_column_snowflake_quoted_id + ] + if method not in ["nearest", None]: + new_qc = SnowflakeQueryCompiler( + new_modin_frame.ensure_row_position_column() ) - ( - new_modin_frame, - mapper, - ) = new_modin_frame.update_snowflake_quoted_identifiers_with_expressions( - { - new_filter_column_snowflake_quoted_id: coalesce( - col(new_filter_column_snowflake_quoted_id), - pandas_lit(False), - ) - } + ordering_column = ( + new_qc._modin_frame.row_position_snowflake_quoted_identifier ) - new_filter_column_snowflake_quoted_id = mapper[ - new_filter_column_snowflake_quoted_id - ] - if method not in ["nearest", None]: - new_qc = SnowflakeQueryCompiler( - new_modin_frame.ensure_row_position_column() - ) - ordering_column = ( - new_qc._modin_frame.row_position_snowflake_quoted_identifier - ) - new_qc = new_qc.sort_rows_by_column_values( - columns=new_modin_frame.index_column_pandas_labels, - ascending=[True], - kind="stable", - na_position="last", - ignore_index=False, - ) - new_qc = new_qc.fillna( - self_is_series=False, - method=method, - limit=limit, # type: ignore[arg-type] - axis=0, - _filter_column_snowflake_quoted_id=new_filter_column_snowflake_quoted_id, - ) - new_ordered_frame = new_qc._modin_frame.ordered_dataframe.sort( - OrderingColumn(snowflake_quoted_identifier=ordering_column) - ) - new_ordered_frame.row_position_snowflake_quoted_identifier = ( - ordering_column + new_qc = new_qc.sort_rows_by_column_values( + columns=new_modin_frame.index_column_pandas_labels, + ascending=[True], + kind="stable", + na_position="last", + ignore_index=False, + ) + new_qc = new_qc._fillna_with_masking( + self_is_series=False, + method=method, + limit=limit, # type: ignore[arg-type] + axis=0, + row_mask_snowflake_quoted_identifier=new_filter_column_snowflake_quoted_id, + ) + new_ordered_frame = new_qc._modin_frame.ordered_dataframe.sort( + OrderingColumn(snowflake_quoted_identifier=ordering_column) + ) + new_ordered_frame.row_position_snowflake_quoted_identifier = ( + ordering_column + ) + new_qc = SnowflakeQueryCompiler( + InternalFrame.create( + ordered_dataframe=new_ordered_frame, + data_column_pandas_labels=new_qc._modin_frame.data_column_pandas_labels[ + :-1 + ], + data_column_snowflake_quoted_identifiers=new_qc._modin_frame.data_column_snowflake_quoted_identifiers[ + :-1 + ], + data_column_pandas_index_names=new_qc._modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=new_qc._modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=new_qc._modin_frame.index_column_snowflake_quoted_identifiers, ) + ) + if fill_value is not np.nan: + new_qc = new_qc._fillna_with_masking( + self_is_series=False, + value=fill_value, + axis=0, + row_mask_snowflake_quoted_identifier=new_filter_column_snowflake_quoted_id, + ) + if method in ["nearest", None]: + # In this case, we haven't removed the dummy column that tells us which NA values + # should not be replaced. new_qc = SnowflakeQueryCompiler( InternalFrame.create( - ordered_dataframe=new_ordered_frame, + ordered_dataframe=new_qc._modin_frame.ordered_dataframe, data_column_pandas_labels=new_qc._modin_frame.data_column_pandas_labels[ :-1 ], @@ -2318,99 +2376,96 @@ def reindex( index_column_snowflake_quoted_identifiers=new_qc._modin_frame.index_column_snowflake_quoted_identifiers, ) ) - if fill_value is not np.nan: - new_qc = new_qc.fillna( - self_is_series=False, - value=fill_value, - axis=0, - _filter_column_snowflake_quoted_id=new_filter_column_snowflake_quoted_id, - ) - if method in ["nearest", None]: - # In this case, we haven't removed the dummy column that tells us which NA values - # should not be replaced. - new_qc = SnowflakeQueryCompiler( - InternalFrame.create( - ordered_dataframe=new_qc._modin_frame.ordered_dataframe, - data_column_pandas_labels=new_qc._modin_frame.data_column_pandas_labels[ - :-1 - ], - data_column_snowflake_quoted_identifiers=new_qc._modin_frame.data_column_snowflake_quoted_identifiers[ - :-1 - ], - data_column_pandas_index_names=new_qc._modin_frame.data_column_pandas_index_names, - index_column_pandas_labels=new_qc._modin_frame.index_column_pandas_labels, - index_column_snowflake_quoted_identifiers=new_qc._modin_frame.index_column_snowflake_quoted_identifiers, - ) - ) - return new_qc - else: - method = kwargs.get("method", None) - level = kwargs.get("level", None) - limit = kwargs.get("limit", None) - tolerance = kwargs.get("tolerance", None) - fill_value = kwargs.get("fill_value", np.nan) # type: ignore[arg-type] - if not self.columns.is_lazy: - self.columns.to_pandas().reindex( - labels, method, level, limit, tolerance + return new_qc + + def _reindex_axis_1( + self, + labels: Union[pandas.Index, list[Any]], + **kwargs: dict[str, Any], + ) -> "SnowflakeQueryCompiler": + """ + Align QueryCompiler data with a new column. + + Parameters + ---------- + labels : list-like + Index-labels to align with. + method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"} + Method to use for filling holes in reindexed frame. + fill_value : scalar + Value to use for missing values in the resulted frame. + limit : int + tolerance : int + **kwargs : dict + Serves the compatibility purpose. Does not affect the result. + + Returns + ------- + SnowflakeQueryCompiler + QueryCompiler with aligned axis. + """ + method = kwargs.get("method", None) + level = kwargs.get("level", None) + limit = kwargs.get("limit", None) + tolerance = kwargs.get("tolerance", None) + fill_value = kwargs.get("fill_value", np.nan) # type: ignore[arg-type] + if not self.columns.is_lazy: + self.columns.to_pandas().reindex(labels, method, level, limit, tolerance) + data_column_pandas_labels = [] + data_column_snowflake_quoted_identifiers = [] + modin_frame = self._modin_frame + for label in labels: + data_column_pandas_labels += [label] + if label in self._modin_frame.data_column_pandas_labels: + snowflake_quoted_ids = list( + modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( + [label], include_index=False + )[0] ) - data_column_pandas_labels = [] - data_column_snowflake_quoted_identifiers = [] - modin_frame = self._modin_frame - for label in labels: - data_column_pandas_labels += [label] - if label in self._modin_frame.data_column_pandas_labels: - snowflake_quoted_ids = list( - modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( - [label], include_index=False - )[ - 0 - ] + data_column_snowflake_quoted_identifiers += snowflake_quoted_ids + if len(snowflake_quoted_ids) > 1: + data_column_pandas_labels += [label] * ( + len(snowflake_quoted_ids) - 1 ) - data_column_snowflake_quoted_identifiers += snowflake_quoted_ids - if len(snowflake_quoted_ids) > 1: - data_column_pandas_labels += [label] * ( - len(snowflake_quoted_ids) - 1 - ) - else: - modin_frame = modin_frame.append_column(label, pandas_lit(np.nan)) - data_column_snowflake_quoted_identifiers += [ - modin_frame.data_column_snowflake_quoted_identifiers[-1] - ] - new_modin_frame = InternalFrame.create( - ordered_dataframe=modin_frame.ordered_dataframe, - data_column_pandas_labels=data_column_pandas_labels, - data_column_snowflake_quoted_identifiers=data_column_snowflake_quoted_identifiers, - data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names, - index_column_pandas_labels=self._modin_frame.index_column_pandas_labels, - index_column_snowflake_quoted_identifiers=self._modin_frame.index_column_snowflake_quoted_identifiers, + else: + modin_frame = modin_frame.append_column(label, pandas_lit(np.nan)) + data_column_snowflake_quoted_identifiers += [ + modin_frame.data_column_snowflake_quoted_identifiers[-1] + ] + new_modin_frame = InternalFrame.create( + ordered_dataframe=modin_frame.ordered_dataframe, + data_column_pandas_labels=data_column_pandas_labels, + data_column_snowflake_quoted_identifiers=data_column_snowflake_quoted_identifiers, + data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names, + index_column_pandas_labels=self._modin_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=self._modin_frame.index_column_snowflake_quoted_identifiers, + ) + new_qc = SnowflakeQueryCompiler(new_modin_frame) + ordered_columns = sorted(data_column_pandas_labels) + columns_to_ignore = [ + c in self._modin_frame.data_column_pandas_labels for c in ordered_columns + ] + if method not in ["nearest", None]: + new_qc = new_qc.take_2d_labels( + index=slice(None), columns=ordered_columns + )._fillna_with_masking( + method=method, limit=limit, columns_mask=columns_to_ignore, self_is_series=False # type: ignore[arg-type] + ) + if fill_value is not np.nan: + new_qc = new_qc.take_2d_labels( + index=slice(None), columns=ordered_columns + )._fillna_with_masking( + value=fill_value, + columns_mask=columns_to_ignore, + self_is_series=False, + ) + if method not in ["nearest", None] or fill_value is not np.nan: + # We only need to reorder the columns if we sorted them above for filling. + new_qc = new_qc.take_2d_labels( + index=slice(None), columns=data_column_pandas_labels ) - new_qc = SnowflakeQueryCompiler(new_modin_frame) - ordered_columns = sorted(data_column_pandas_labels) - columns_to_ignore = [ - c in self._modin_frame.data_column_pandas_labels - for c in ordered_columns - ] - if method not in ["nearest", None]: - new_qc = new_qc.take_2d_labels( - index=slice(None), columns=ordered_columns - ).fillna( - method=method, limit=limit, _columns_to_ignore=columns_to_ignore, self_is_series=False # type: ignore[arg-type] - ) - if fill_value is not np.nan: - new_qc = new_qc.take_2d_labels( - index=slice(None), columns=ordered_columns - ).fillna( - value=fill_value, - _columns_to_ignore=columns_to_ignore, - self_is_series=False, - ) - if method not in ["nearest", None] or fill_value is not np.nan: - # We only need to reorder the columns if we sorted them above for filling. - new_qc = new_qc.take_2d_labels( - index=slice(None), columns=data_column_pandas_labels - ) - return new_qc + return new_qc def _parse_names_arguments_from_reset_index( self, @@ -9049,7 +9104,6 @@ def _make_fill_expression_for_column_wise_fillna( col_pos = self._modin_frame.data_column_snowflake_quoted_identifiers.index( snowflake_quoted_identifier ) - # If we are looking at the first column and doing an ffill, or looking at the last column and doing a bfill, # there are no other columns for us to coalesce with, so returning coalesce will error since it will be a # coalesce with one column. Instead, we just return the column. @@ -9057,7 +9111,6 @@ def _make_fill_expression_for_column_wise_fillna( col_pos == len_ids - 1 and not method_is_ffill ): return col(snowflake_quoted_identifier) - if method_is_ffill: start_pos = 0 if limit is not None: @@ -9134,8 +9187,6 @@ def fillna( axis: Optional[Axis] = None, limit: Optional[int] = None, downcast: Optional[dict] = None, - _filter_column_snowflake_quoted_id: Optional[str] = None, - _columns_to_ignore: Optional[list[bool]] = None, ) -> "SnowflakeQueryCompiler": """ Replace NaN values using provided method. @@ -9147,14 +9198,53 @@ def fillna( axis : {0, 1} limit : int, optional downcast : dict, optional - _filter_column_snowflake_quoted_id : str, optional - _columns_to_ignore : list[bool], optional **kwargs : dict Serves the compatibility purpose. Does not affect the result. Returns ------- - BaseQueryCompiler + SnowflakeQueryCompiler + New QueryCompiler with all null values filled. + """ + return self._fillna_with_masking( + value=value, + self_is_series=self_is_series, + method=method, + axis=axis, + limit=limit, + downcast=downcast, + ) + + def _fillna_with_masking( + self, + value: Optional[Union[Hashable, Mapping, "pd.DataFrame", "pd.Series"]] = None, + *, + self_is_series: bool, + method: Optional[FillnaOptions] = None, + axis: Optional[Axis] = None, + limit: Optional[int] = None, + downcast: Optional[dict] = None, + row_mask_snowflake_quoted_identifier: Optional[str] = None, + columns_mask: Optional[list[bool]] = None, + ) -> "SnowflakeQueryCompiler": + """ + Replace NaN values using provided method. + + Parameters + ---------- + value : scalar or dict + method : {"backfill", "bfill", "pad", "ffill", None} + axis : {0, 1} + limit : int, optional + downcast : dict, optional + row_mask_snowflake_quoted_identifier : str, optional + columns_mask : list[bool], optional + **kwargs : dict + Serves the compatibility purpose. Does not affect the result. + + Returns + ------- + SnowflakeQueryCompiler New QueryCompiler with all null values filled. """ if value is not None and limit is not None: @@ -9205,14 +9295,14 @@ def fillna_expr(snowflake_quoted_id: str) -> SnowparkColumn: ), ) - if _filter_column_snowflake_quoted_id: + if row_mask_snowflake_quoted_identifier: # This is an internal argument passed in by reindex # that specifies a column to filter on when doing fillna # columns that this filter is True for should have their # NA values ignored. fillna_column_map = { snowflake_quoted_id: iff( - col(_filter_column_snowflake_quoted_id), + col(row_mask_snowflake_quoted_identifier), col(snowflake_quoted_id), fillna_expr(snowflake_quoted_id), ) @@ -9225,7 +9315,7 @@ def fillna_expr(snowflake_quoted_id: str) -> SnowparkColumn: for snowflake_quoted_id in self._modin_frame.data_column_snowflake_quoted_identifiers } else: - if _columns_to_ignore is None: + if columns_mask is None: fillna_column_map = { snowflake_quoted_id: self._make_fill_expression_for_column_wise_fillna( snowflake_quoted_id, @@ -9240,12 +9330,12 @@ def fillna_expr(snowflake_quoted_id: str) -> SnowparkColumn: snowflake_quoted_id, method, limit=limit, - _columns_to_ignore=_columns_to_ignore, + _columns_to_ignore=columns_mask, ) for i, snowflake_quoted_id in enumerate( self._modin_frame.data_column_snowflake_quoted_identifiers ) - if not _columns_to_ignore[i] + if not columns_mask[i] } # case 3: fillna with a mapping else: @@ -9282,10 +9372,10 @@ def fillna_expr(snowflake_quoted_id: str) -> SnowparkColumn: include_index=False, ) fillna_column_map = {} - if _columns_to_ignore is not None: + if columns_mask is not None: columns_to_ignore = itertools.compress( self._modin_frame.data_column_pandas_labels, - _columns_to_ignore, + columns_mask, ) else: columns_to_ignore = [] # type:ignore [assignment] @@ -9293,11 +9383,11 @@ def fillna_expr(snowflake_quoted_id: str) -> SnowparkColumn: if label not in columns_to_ignore: for id in id_tuple: val = label_to_value_map[label] - if _filter_column_snowflake_quoted_id is None: + if row_mask_snowflake_quoted_identifier is None: fillna_column_map[id] = coalesce(id, pandas_lit(val)) else: fillna_column_map[id] = iff( - col(_filter_column_snowflake_quoted_id), + col(row_mask_snowflake_quoted_identifier), col(id), coalesce(id, pandas_lit(val)), ) diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index 134941f5165..ee067a41fc6 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -13,6 +13,23 @@ from tests.integ.modin.utils import eval_snowpark_pandas_result +@pytest.mark.parametrize("axis", [0, 1]) +def test_reindex_invalid_limit_parameter(axis): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), index=list("ABC"), columns=list("ABC") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(axis=axis, labels=list("CAB"), fill_value=1, limit=1), + expect_exception=True, + expect_exception_match="limit argument only valid if doing pad, backfill or nearest reindexing", + expect_exception_type=ValueError, + assert_exception_equal=True, + ) + + class TestReindexAxis0: @sql_count_checker(query_count=1, join_count=1) def test_reindex_index_basic_reorder(self): diff --git a/tests/integ/modin/series/test_reindex.py b/tests/integ/modin/series/test_reindex.py new file mode 100644 index 00000000000..956eb4593c9 --- /dev/null +++ b/tests/integ/modin/series/test_reindex.py @@ -0,0 +1,335 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import re + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.exceptions import SnowparkSQLException +from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.utils import eval_snowpark_pandas_result + + +def test_reindex_invalid_limit_parameter(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(index=list("CAB"), fill_value=1, limit=1), + expect_exception=True, + expect_exception_match="limit argument only valid if doing pad, backfill or nearest reindexing", + expect_exception_type=ValueError, + assert_exception_equal=True, + ) + + +def test_reindex_invalid_axis_parameter_ignored(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(axis=1, index=list("CAB")), + ) + + +def test_reindex_invalid_labels_parameter(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(labels=list("CAB")), + expect_exception=True, + expect_exception_match=re.escape( + "Series.reindex() got an unexpected keyword argument 'labels'" + ), + expect_exception_type=TypeError, + assert_exception_equal=True, + ) + + +def test_reindex_index_passed_twice(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(list("CAB"), index=list("CAB")), + expect_exception=True, + expect_exception_match=re.escape( + "Series.reindex() got multiple values for argument 'index'" + ), + expect_exception_type=TypeError, + assert_exception_equal=True, + ) + + +def test_reindex_multiple_args_passed(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(list("CAB"), index=list("CAB")), + expect_exception=True, + expect_exception_match=re.escape( + "Series.reindex() got multiple values for argument 'index'" + ), + expect_exception_type=TypeError, + assert_exception_equal=True, + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_reorder(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, native_series, lambda series: series.reindex(index=list("CAB")) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_add_elements(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, native_series, lambda series: series.reindex(index=list("CABDEF")) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_remove_elements(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, native_series, lambda series: series.reindex(index=list("CA")) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_basic_add_remove_elements(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, native_series, lambda series: series.reindex(index=list("CADEFG")) + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_fill_value(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(index=list("CADEFG"), fill_value=-1.0), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +def test_reindex_index_fill_method(method, limit): + native_series = native_pd.Series([0, 1, 2], index=list("ACE")) + snow_series = pd.Series(native_series) + + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex( + index=list("ABCDEFGH"), method=method, limit=limit + ), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +def test_reindex_index_fill_method_pandas_negative(method, limit): + # The negative is because pandas does not support `limit` parameter + # if the index and target are not both monotonic, while we do. + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + with pytest.raises( + ValueError, + match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", + ): + native_series.reindex(index=list("CEBFGA"), method=method, limit=limit) + + def perform_reindex(series): + if isinstance(series, pd.Series): + return series.reindex(index=list("CADEFG"), method=method, limit=limit) + else: + return series.reindex( + index=list("ACDEFG"), method=method, limit=limit + ).reindex(list("CADEFG")) + + eval_snowpark_pandas_result( + snow_series, + native_series, + perform_reindex, + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_ordered_index_unordered_new_index(): + ordered_native_Series = native_pd.Series([5, 6, 8], index=[5, 6, 8]) + ordered_snow_series = pd.Series(ordered_native_Series) + eval_snowpark_pandas_result( + ordered_snow_series, + ordered_native_Series, + lambda series: series.reindex(index=[6, 8, 7], method="ffill"), + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_fill_value_with_old_na_values(): + native_series = native_pd.Series([1, np.nan, 3], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex(index=list("CEBFGA"), fill_value=-1), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +@pytest.mark.xfail(reason="Cannot qualify window in fillna.") +def test_reindex_index_fill_method_with_old_na_values(limit, method): + # Say there are NA values in the data before reindex. reindex is called with + # ffill as the method, and there is a new NA value in the row following the + # row with the pre-existing NA value. In this case, the ffilled value should + # be an NA value (rather than looking to previous rows for a non-NA value). + # To support this, we would need to have `ignore_nulls=False`, but we would + # also need to qualify the window for values that were in the original Series + # as otherwise, if we have multiple new index values that have NA values, we would + # pick these NA values instead of finding a non-NA value from the original Series. + native_series = native_pd.Series([1, np.nan, 3], index=list("ACE")) + snow_series = pd.Series(native_series) + + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda series: series.reindex( + index=list("ABCDEFGH"), method=method, limit=limit + ), + ) + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +def test_reindex_index_fill_method_with_old_na_values_pandas_negative(limit, method): + native_series = native_pd.Series([1, np.nan, 3], index=list("ABC")) + snow_series = pd.Series(native_series) + if limit is not None: + method_str = {"bfill": "backfill", "ffill": "pad"}.get(method, method) + with pytest.raises( + ValueError, + match=f"limit argument for '{method_str}' method only well-defined if index and target are monotonic", + ): + native_series.reindex(index=list("CEBFGA"), method=method, limit=limit) + + def perform_reindex(series): + if isinstance(series, pd.Series): + return series.reindex(index=list("CEBFGA"), method=method, limit=limit) + else: + return series.reindex( + index=list("ABCEFG"), method=method, limit=limit + ).reindex(list("CEBFGA")) + + eval_snowpark_pandas_result( + snow_series, + native_series, + perform_reindex, + ) + + +@sql_count_checker(query_count=2, join_count=1) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +def test_reindex_index_datetime_with_fill(limit, method): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_series = native_pd.Series( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_series = pd.Series( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + def perform_reindex(series): + if isinstance(series, pd.Series): + return series.reindex( + pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + else: + return series.reindex( + native_pd.date_range("12/29/2009", periods=10, freq="D"), + method=method, + limit=limit, + ) + + eval_snowpark_pandas_result( + snow_series, native_series, perform_reindex, check_freq=False + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_reindex_index_non_overlapping_index(): + native_series = native_pd.Series([0, 1, 2], index=list("ABC")) + snow_series = pd.Series(native_series) + eval_snowpark_pandas_result( + snow_series, native_series, lambda series: series.reindex(index=list("EFG")) + ) + + +@sql_count_checker(query_count=2, join_count=1) +def test_reindex_index_non_overlapping_datetime_index(): + date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") + native_series = native_pd.Series( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_series = pd.Series( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + def perform_reindex(series): + if isinstance(series, pd.Series): + return series.reindex( + pd.date_range("12/29/2008", periods=10, freq="D"), + ) + else: + return series.reindex( + native_pd.date_range("12/29/2008", periods=10, freq="D"), + ) + + eval_snowpark_pandas_result( + snow_series, native_series, perform_reindex, check_freq=False + ) + + +@sql_count_checker(query_count=1) +def test_reindex_index_non_overlapping_different_types_index_negative(): + date_index = pd.date_range("1/1/2010", periods=6, freq="D") + snow_series = pd.Series( + {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ) + + with pytest.raises(SnowparkSQLException, match=".*Timestamp 'A' is not recognized"): + snow_series.reindex(list("ABC")).to_pandas() From 299885f26da6e48f897dfc0d7b857ce0c358bf50 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Thu, 11 Jul 2024 17:57:56 -0700 Subject: [PATCH 14/20] Add sentinel --- .../plugin/compiler/snowflake_query_compiler.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 51b3b1e271b..e13be2c2a6a 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2410,8 +2410,17 @@ def _reindex_axis_1( limit = kwargs.get("limit", None) tolerance = kwargs.get("tolerance", None) fill_value = kwargs.get("fill_value", np.nan) # type: ignore[arg-type] - if not self.columns.is_lazy: - self.columns.to_pandas().reindex(labels, method, level, limit, tolerance) + # Currently, our error checking relies on the column axis being eager (i.e. stored + # locally as a pandas Index, rather than pushed down to the database). This allows + # us to have parity with native pandas for things like monotonicity checks. If + # our columns are no longer eagerly stored, we would no longer be able to rely + # on pandas for these error checks, and the behaviour of reindex would change. + # This change is user-facing, so we should catch this in CI first, which we can + # by having this assert here, as a sentinel. + assert ( + not self.columns.is_lazy() + ), "`reindex` with axis=1 failed on error checking." + self.columns.to_pandas().reindex(labels, method, level, limit, tolerance) data_column_pandas_labels = [] data_column_snowflake_quoted_identifiers = [] modin_frame = self._modin_frame From 2d2c04699407af596f6f499ba42b7a8e5ff90bf3 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 15 Jul 2024 13:35:18 -0700 Subject: [PATCH 15/20] Add docstrings --- .../modin/supported/dataframe_supported.rst | 3 +- .../modin/supported/series_supported.rst | 3 +- .../modin/plugin/docstrings/dataframe.py | 205 +++++++++++++++++- .../modin/plugin/docstrings/series.py | 205 +++++++++++++++++- 4 files changed, 412 insertions(+), 4 deletions(-) diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst index 762e825bafd..4237aa78ab3 100644 --- a/docs/source/modin/supported/dataframe_supported.rst +++ b/docs/source/modin/supported/dataframe_supported.rst @@ -330,7 +330,8 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``rdiv`` | P | ``level`` | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``reindex`` | P | | ``N`` if axis is MultiIndex. | +| ``reindex`` | P | | ``N`` if axis is MultiIndex or method is | +| | | | ``nearest``. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``reindex_like`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/docs/source/modin/supported/series_supported.rst b/docs/source/modin/supported/series_supported.rst index 71c9de2e2b2..45abf4069b0 100644 --- a/docs/source/modin/supported/series_supported.rst +++ b/docs/source/modin/supported/series_supported.rst @@ -322,7 +322,8 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``rdivmod`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``reindex`` | P | | ``N`` if the series has MultiIndex | +| ``reindex`` | P | | ``N`` if the series has MultiIndex, or method | +| | | | is ``nearest``. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``reindex_like`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py index 441121f9363..e7ef078cc61 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py @@ -2709,7 +2709,210 @@ def rename(): """ def reindex(): - pass + """ + Conform DataFrame to new index with optional filling logic. + + Places NA/NaN in locations having no value in the previous index. A new object is produced + unless the new index is equivalent to the current one and copy=False. + + Parameters + ---------- + index : array-like, optional + New labels for the index. + axis : int or str, optional + Unused. + method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"}, default: None + Method to use for filling holes in reindexed DataFrame. + + * None (default): don't fill gaps + * pad / ffill: Propagate last valid observation forward to next valid. + * backfill / bfill: Use next valid observation to fill gap. + * nearest: Use nearest valid observations to fill gap. Unsupported by Snowpark pandas. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + + level : int or name + Broadcast across a level, matching Index values on the passed MultiIndex level. + + fill_value : scalar, default np.nan + Value to use for missing values. Defaults to NaN, but can be any “compatible” value. + + limit : int, default None + Maximum number of consecutive elements to forward or backward fill. + + tolerance : optional + Maximum distance between original and new labels for inexact matches. + The values of the index at the matching locations most satisfy the + equation abs(index[indexer] - target) <= tolerance. Unsupported by + Snowpark pandas. + + Returns + ------- + DataFrame + DataFrame with changed index. + + Notes + ----- + For axis 0, Snowpark pandas' behaviour diverges from vanilla pandas in order + to maintain Snowpark's lazy execution paradigm. The behaviour changes are as follows: + + * Snowpark pandas does not error if the existing index is not monotonically increasing + or decreasing when `method` is specified for filling. It instead assumes that + the index is monotonically increasing, performs the reindex, and fills the values + as though the index is sorted (which involves sorting internally). + * Snowpark pandas does not error out if there are duplicates - they are included in the + output. + * Snowpark pandas does not error if a `limit` value is passed and the new index is not + monotonically increasing or decreasing - instead, it reindexes, sorts the new index, + fills using limit, and then reorders the data to be in the correct order (the order + of the target labels passed in to the method). + + For axis 1, Snowpark pandas' error checking remains the same as vanilla pandas. + + MultiIndex is currently unsupported. + + ``method="nearest"`` is currently unsupported. + + Examples + -------- + Create a dataframe with some fictional data. + + >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] + ... df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301], + ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, + ... index=index) + ... df + http_status response_time + Firefox 200 0.04 + Chrome 200 0.02 + Safari 404 0.07 + IE10 404 0.08 + Konqueror 301 1.00 + + Create a new index and reindex the dataframe. By default, values in the new index + that do not have corresponding records in the dataframe are assigned NaN. + + >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', + ... 'Chrome'] + ... df.reindex(new_index) + http_status response_time + Safari 404.0 0.07 + Iceweasel NaN NaN + Comodo Dragon NaN NaN + IE10 404.0 0.08 + Chrome 200.0 0.02 + + We can fill in the missing values by passing a value to the keyword fill_value. + + >>> df.reindex(new_index, fill_value=0) + http_status response_time + Safari 404 0.07 + Iceweasel 0 0.00 + Comodo Dragon 0 0.00 + IE10 404 0.08 + Chrome 200 0.02 + + >>> df.reindex(new_index, fill_value='missing') + http_status response_time + Safari 404 0.07 + Iceweasel missing missing + Comodo Dragon missing missing + IE10 404 0.08 + Chrome 200 0.02 + + We can also reindex the columns. + + >>> df.reindex(columns=['http_status', 'user_agent']) + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + Or we can use “axis-style” keyword arguments + + >>> df.reindex(['http_status', 'user_agent'], axis="columns") + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + To further illustrate the filling functionality in reindex, we will create a dataframe + with a monotonically increasing index (for example, a sequence of dates). + + >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') + ... df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, + ... index=date_index) + ... df2 + prices + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + + Suppose we decide to expand the dataframe to cover a wider date range. + + >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') + ... df2.reindex(date_index2) + prices + 2009-12-29 NaN + 2009-12-30 NaN + 2009-12-31 NaN + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + The index entries that did not have a value in the original data frame (for example, + ``2009-12-29``) are by default filled with NaN. If desired, we can fill in the missing + values using one of several options. + + For example, to back-propagate the last valid value to fill the NaN values, pass bfill as + an argument to the method keyword. + + >>> df2.reindex(date_index2, method='bfill') + prices + 2009-12-29 100.0 + 2009-12-30 100.0 + 2009-12-31 100.0 + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + Please note that the NaN value present in the original dataframe (at index value 2010-01-03) will + not be filled by any of the value propagation schemes. This is because filling while reindexing + does not look at dataframe values, but only compares the original and desired indexes. If you do + want to fill in the NaN values present in the original dataframe, use the fillna() method. + + An example illustrating Snowpark pandas' behavior when dealing with non-monotonic indices. + >>> unordered_dataframe = pd.DataFrame([[5]*3, [8]*3, [6]*3], columns=list("ABC"), index=[5, 8, 6]) + ... unordered_dataframe + A B C + 5 5 5 5 + 8 8 8 8 + 6 6 6 6 + >>> unordered_dataframe.reindex([6, 8, 7], method="ffill") + A B C + 6 6 6 6 + 8 8 8 8 + 7 6 6 6 + + In the example above, index value ``7`` is forward filled from index value ``6``, since that + is the previous index value when the data is sorted. + """ def replace(): """ diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py index dfc72f7ebc8..b9e5f29a991 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -1892,7 +1892,210 @@ def ravel(): """ def reindex(): - pass + """ + Conform Series to new index with optional filling logic. + + Places NA/NaN in locations having no value in the previous index. A new object is produced + unless the new index is equivalent to the current one and copy=False. + + Parameters + ---------- + index : array-like, optional + New labels for the index. + axis : int or str, optional + Unused. + method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"}, default: None + Method to use for filling holes in reindexed DataFrame. + + * None (default): don't fill gaps + * pad / ffill: Propagate last valid observation forward to next valid. + * backfill / bfill: Use next valid observation to fill gap. + * nearest: Use nearest valid observations to fill gap. Unsupported by Snowpark pandas. + + copy : bool, default True + Return a new object, even if the passed indexes are the same. + + level : int or name + Broadcast across a level, matching Index values on the passed MultiIndex level. + + fill_value : scalar, default np.nan + Value to use for missing values. Defaults to NaN, but can be any “compatible” value. + + limit : int, default None + Maximum number of consecutive elements to forward or backward fill. + + tolerance : optional + Maximum distance between original and new labels for inexact matches. + The values of the index at the matching locations most satisfy the + equation abs(index[indexer] - target) <= tolerance. Unsupported by + Snowpark pandas. + + Returns + ------- + Series + Series with changed index. + + Notes + ----- + For axis 0, Snowpark pandas' behaviour diverges from vanilla pandas in order + to maintain Snowpark's lazy execution paradigm. The behaviour changes are as follows: + + * Snowpark pandas does not error if the existing index is not monotonically increasing + or decreasing when `method` is specified for filling. It instead assumes that + the index is monotonically increasing, performs the reindex, and fills the values + as though the index is sorted (which involves sorting internally). + * Snowpark pandas does not error out if there are duplicates - they are included in the + output. + * Snowpark pandas does not error if a `limit` value is passed and the new index is not + monotonically increasing or decreasing - instead, it reindexes, sorts the new index, + fills using limit, and then reorders the data to be in the correct order (the order + of the target labels passed in to the method). + + For axis 1, Snowpark pandas' error checking remains the same as vanilla pandas. + + MultiIndex is currently unsupported. + + ``method="nearest"`` is currently unsupported. + + Examples + -------- + Create a dataframe with some fictional data. + + >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] + ... df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301], + ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, + ... index=index) + ... df + http_status response_time + Firefox 200 0.04 + Chrome 200 0.02 + Safari 404 0.07 + IE10 404 0.08 + Konqueror 301 1.00 + + Create a new index and reindex the dataframe. By default, values in the new index + that do not have corresponding records in the dataframe are assigned NaN. + + >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', + ... 'Chrome'] + ... df.reindex(new_index) + http_status response_time + Safari 404.0 0.07 + Iceweasel NaN NaN + Comodo Dragon NaN NaN + IE10 404.0 0.08 + Chrome 200.0 0.02 + + We can fill in the missing values by passing a value to the keyword fill_value. + + >>> df.reindex(new_index, fill_value=0) + http_status response_time + Safari 404 0.07 + Iceweasel 0 0.00 + Comodo Dragon 0 0.00 + IE10 404 0.08 + Chrome 200 0.02 + + >>> df.reindex(new_index, fill_value='missing') + http_status response_time + Safari 404 0.07 + Iceweasel missing missing + Comodo Dragon missing missing + IE10 404 0.08 + Chrome 200 0.02 + + We can also reindex the columns. + + >>> df.reindex(columns=['http_status', 'user_agent']) + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + Or we can use “axis-style” keyword arguments + + >>> df.reindex(['http_status', 'user_agent'], axis="columns") + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + To further illustrate the filling functionality in reindex, we will create a dataframe + with a monotonically increasing index (for example, a sequence of dates). + + >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') + ... df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, + ... index=date_index) + ... df2 + prices + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + + Suppose we decide to expand the dataframe to cover a wider date range. + + >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') + ... df2.reindex(date_index2) + prices + 2009-12-29 NaN + 2009-12-30 NaN + 2009-12-31 NaN + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + The index entries that did not have a value in the original data frame (for example, + ``2009-12-29``) are by default filled with NaN. If desired, we can fill in the missing + values using one of several options. + + For example, to back-propagate the last valid value to fill the NaN values, pass bfill as + an argument to the method keyword. + + >>> df2.reindex(date_index2, method='bfill') + prices + 2009-12-29 100.0 + 2009-12-30 100.0 + 2009-12-31 100.0 + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + Please note that the NaN value present in the original dataframe (at index value 2010-01-03) will + not be filled by any of the value propagation schemes. This is because filling while reindexing + does not look at dataframe values, but only compares the original and desired indexes. If you do + want to fill in the NaN values present in the original dataframe, use the fillna() method. + + An example illustrating Snowpark pandas' behavior when dealing with non-monotonic indices. + >>> unordered_dataframe = pd.DataFrame([[5]*3, [8]*3, [6]*3], columns=list("ABC"), index=[5, 8, 6]) + ... unordered_dataframe + A B C + 5 5 5 5 + 8 8 8 8 + 6 6 6 6 + >>> unordered_dataframe.reindex([6, 8, 7], method="ffill") + A B C + 6 6 6 6 + 8 8 8 8 + 7 6 6 6 + + In the example above, index value ``7`` is forward filled from index value ``6``, since that + is the previous index value when the data is sorted. + """ def rename_axis(): """ From d85b936f9ba64231f6bf39b2894215748911f279 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 15 Jul 2024 14:02:09 -0700 Subject: [PATCH 16/20] Fix tests --- .../compiler/snowflake_query_compiler.py | 2 +- tests/integ/modin/frame/test_reindex.py | 51 ++++++++++++++++++- tests/integ/modin/series/test_reindex.py | 33 +++++++++++- 3 files changed, 83 insertions(+), 3 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 7c0465ec0c9..e234d9d2ff1 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2418,7 +2418,7 @@ def _reindex_axis_1( # This change is user-facing, so we should catch this in CI first, which we can # by having this assert here, as a sentinel. assert ( - not self.columns.is_lazy() + not self.columns.is_lazy ), "`reindex` with axis=1 failed on error checking." self.columns.to_pandas().reindex(labels, method, level, limit, tolerance) data_column_pandas_labels = [] diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index ee067a41fc6..f522b6f9ea4 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -10,7 +10,10 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker -from tests.integ.modin.utils import eval_snowpark_pandas_result +from tests.integ.modin.utils import ( + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, + eval_snowpark_pandas_result, +) @pytest.mark.parametrize("axis", [0, 1]) @@ -281,6 +284,33 @@ def test_reindex_index_non_overlapping_different_types_index_negative(self): ): snow_df.reindex(list("ABC")).to_pandas() + @sql_count_checker(query_count=1, join_count=1) + @pytest.mark.parametrize( + "new_index", [list("ABC"), list("ABCC"), list("ABBC"), list("AABC")] + ) + def test_reindex_index_duplicate_values(self, new_index): + native_df = native_pd.DataFrame(np.arange(9).reshape((3, 3)), index=list("ABA")) + snow_df = pd.DataFrame(native_df) + + with pytest.raises( + ValueError, match="cannot reindex on an axis with duplicate labels" + ): + native_df.reindex(index=new_index) + + dfs_to_concat = [] + for row in new_index: + if row not in list("AB"): + dfs_to_concat.append(native_pd.DataFrame([[np.nan] * 3], index=[row])) + else: + value = native_df.loc[row] + if isinstance(value, native_pd.Series): + value = native_pd.DataFrame(value).T + dfs_to_concat.append(value) + result_native_df = native_pd.concat(dfs_to_concat) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + snow_df.reindex(index=new_index), result_native_df + ) + class TestReindexAxis1: @sql_count_checker(query_count=1) @@ -503,3 +533,22 @@ def test_reindex_columns_non_overlapping_different_types_columns(self): eval_snowpark_pandas_result( snow_df, native_df, lambda df: df.reindex(columns=list("ABCD")) ) + + @sql_count_checker(query_count=0) + @pytest.mark.parametrize( + "new_columns", [list("ABC"), list("ABCC"), list("ABBC"), list("AABC")] + ) + def test_reindex_columns_duplicate_values(self, new_columns): + native_df = native_pd.DataFrame( + np.arange(9).reshape((3, 3)), columns=list("ABA") + ) + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.reindex(columns=new_columns), + expect_exception=True, + expect_exception_match="cannot reindex on an axis with duplicate labels", + assert_exception_equal=True, + expect_exception_type=ValueError, + ) diff --git a/tests/integ/modin/series/test_reindex.py b/tests/integ/modin/series/test_reindex.py index 956eb4593c9..6207a73049a 100644 --- a/tests/integ/modin/series/test_reindex.py +++ b/tests/integ/modin/series/test_reindex.py @@ -12,7 +12,10 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException from tests.integ.modin.sql_counter import sql_count_checker -from tests.integ.modin.utils import eval_snowpark_pandas_result +from tests.integ.modin.utils import ( + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, + eval_snowpark_pandas_result, +) def test_reindex_invalid_limit_parameter(): @@ -333,3 +336,31 @@ def test_reindex_index_non_overlapping_different_types_index_negative(): with pytest.raises(SnowparkSQLException, match=".*Timestamp 'A' is not recognized"): snow_series.reindex(list("ABC")).to_pandas() + + +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize( + "new_index", [list("ABC"), list("ABCC"), list("ABBC"), list("AABC")] +) +def test_reindex_index_duplicate_values(new_index): + native_series = native_pd.Series([0, 1, 2], index=list("ABA")) + snow_series = pd.Series(native_series) + + with pytest.raises( + ValueError, match="cannot reindex on an axis with duplicate labels" + ): + native_series.reindex(index=new_index) + + series_to_concat = [] + for row in new_index: + if row not in list("AB"): + series_to_concat.append(native_pd.Series([np.nan], index=[row])) + else: + value = native_series.loc[row] + if not isinstance(value, native_pd.Series): + value = native_pd.Series([value], index=[row]) + series_to_concat.append(value) + result_native_series = native_pd.concat(series_to_concat) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + snow_series.reindex(index=new_index), result_native_series + ) From 84e5fdcb18a33101ae7d180f7ccd500faa0152e3 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 15 Jul 2024 14:11:41 -0700 Subject: [PATCH 17/20] Fix tests --- tests/integ/modin/frame/test_reindex.py | 17 +++++++++++++++++ tests/integ/modin/series/test_reindex.py | 18 ++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index f522b6f9ea4..692fd66471f 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -16,6 +16,7 @@ ) +@sql_count_checker(query_count=0) @pytest.mark.parametrize("axis", [0, 1]) def test_reindex_invalid_limit_parameter(axis): native_df = native_pd.DataFrame( @@ -552,3 +553,19 @@ def test_reindex_columns_duplicate_values(self, new_columns): assert_exception_equal=True, expect_exception_type=ValueError, ) + + +@sql_count_checker(query_count=0) +@pytest.mark.parametrize("axis", [0, 1]) +def test_reindex_multiindex_negative(axis): + snow_df = pd.DataFrame(np.arange(9).reshape((3, 3))) + snow_df = snow_df.set_index([0, 1]) + + with pytest.raises( + NotImplementedError, + match="Snowpark pandas doesn't support `reindex` with MultiIndex", + ): + if axis == 0: + snow_df.reindex(index=[1, 2, 3]) + else: + snow_df.T.reindex(columns=[1, 2, 3]) diff --git a/tests/integ/modin/series/test_reindex.py b/tests/integ/modin/series/test_reindex.py index 6207a73049a..78d2986a366 100644 --- a/tests/integ/modin/series/test_reindex.py +++ b/tests/integ/modin/series/test_reindex.py @@ -18,6 +18,7 @@ ) +@sql_count_checker(query_count=0) def test_reindex_invalid_limit_parameter(): native_series = native_pd.Series([0, 1, 2], index=list("ABC")) snow_series = pd.Series(native_series) @@ -32,6 +33,7 @@ def test_reindex_invalid_limit_parameter(): ) +@sql_count_checker(query_count=1, join_count=1) def test_reindex_invalid_axis_parameter_ignored(): native_series = native_pd.Series([0, 1, 2], index=list("ABC")) snow_series = pd.Series(native_series) @@ -42,6 +44,7 @@ def test_reindex_invalid_axis_parameter_ignored(): ) +@sql_count_checker(query_count=0) def test_reindex_invalid_labels_parameter(): native_series = native_pd.Series([0, 1, 2], index=list("ABC")) snow_series = pd.Series(native_series) @@ -58,6 +61,7 @@ def test_reindex_invalid_labels_parameter(): ) +@sql_count_checker(query_count=0) def test_reindex_index_passed_twice(): native_series = native_pd.Series([0, 1, 2], index=list("ABC")) snow_series = pd.Series(native_series) @@ -74,6 +78,7 @@ def test_reindex_index_passed_twice(): ) +@sql_count_checker(query_count=0) def test_reindex_multiple_args_passed(): native_series = native_pd.Series([0, 1, 2], index=list("ABC")) snow_series = pd.Series(native_series) @@ -364,3 +369,16 @@ def test_reindex_index_duplicate_values(new_index): assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( snow_series.reindex(index=new_index), result_native_series ) + + +@sql_count_checker(query_count=0) +def test_reindex_multiindex_negative(): + snow_series = pd.Series( + [0, 1, 2], index=native_pd.MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)]) + ) + + with pytest.raises( + NotImplementedError, + match="Snowpark pandas doesn't support `reindex` with MultiIndex", + ): + snow_series.reindex(index=[1, 2, 3]) From 1059b38f3b939babe70a6b85e3de5cf7dd7f001e Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 15 Jul 2024 14:58:26 -0700 Subject: [PATCH 18/20] Fix docstrings --- .../modin/plugin/docstrings/dataframe.py | 56 +++++++++---------- .../modin/plugin/docstrings/series.py | 56 +++++++++---------- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py index 8e628ec52c4..e4373ed5cf6 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py @@ -2855,10 +2855,10 @@ def reindex(): Create a dataframe with some fictional data. >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] - ... df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301], + >>> df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301], ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, ... index=index) - ... df + >>> df http_status response_time Firefox 200 0.04 Chrome 200 0.02 @@ -2871,7 +2871,7 @@ def reindex(): >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', ... 'Chrome'] - ... df.reindex(new_index) + >>> df.reindex(new_index) http_status response_time Safari 404.0 0.07 Iceweasel NaN NaN @@ -2889,41 +2889,41 @@ def reindex(): IE10 404 0.08 Chrome 200 0.02 - >>> df.reindex(new_index, fill_value='missing') - http_status response_time - Safari 404 0.07 - Iceweasel missing missing - Comodo Dragon missing missing - IE10 404 0.08 - Chrome 200 0.02 + >>> df.reindex(new_index, fill_value=-1) # doctest: +NORMALIZE_WHITESPACE + http_status response_time + Safari 404 0.07 + Iceweasel -1 -1.00 + Comodo Dragon -1 -1.00 + IE10 404 0.08 + Chrome 200 0.02 We can also reindex the columns. - >>> df.reindex(columns=['http_status', 'user_agent']) - http_status user_agent - Firefox 200 NaN - Chrome 200 NaN - Safari 404 NaN - IE10 404 NaN - Konqueror 301 NaN + >>> df.reindex(columns=['http_status', 'user_agent']) # doctest: +NORMALIZE_WHITESPACE + http_status user_agent + Firefox 200 None + Chrome 200 None + Safari 404 None + IE10 404 None + Konqueror 301 None Or we can use “axis-style” keyword arguments - >>> df.reindex(['http_status', 'user_agent'], axis="columns") - http_status user_agent - Firefox 200 NaN - Chrome 200 NaN - Safari 404 NaN - IE10 404 NaN - Konqueror 301 NaN + >>> df.reindex(['http_status', 'user_agent'], axis="columns") # doctest: +NORMALIZE_WHITESPACE + http_status user_agent + Firefox 200 None + Chrome 200 None + Safari 404 None + IE10 404 None + Konqueror 301 None To further illustrate the filling functionality in reindex, we will create a dataframe with a monotonically increasing index (for example, a sequence of dates). >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') - ... df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, + >>> df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, ... index=date_index) - ... df2 + >>> df2 prices 2010-01-01 100.0 2010-01-02 101.0 @@ -2935,7 +2935,7 @@ def reindex(): Suppose we decide to expand the dataframe to cover a wider date range. >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') - ... df2.reindex(date_index2) + >>> df2.reindex(date_index2) prices 2009-12-29 NaN 2009-12-30 NaN @@ -2975,7 +2975,7 @@ def reindex(): An example illustrating Snowpark pandas' behavior when dealing with non-monotonic indices. >>> unordered_dataframe = pd.DataFrame([[5]*3, [8]*3, [6]*3], columns=list("ABC"), index=[5, 8, 6]) - ... unordered_dataframe + >>> unordered_dataframe A B C 5 5 5 5 8 8 8 8 diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py index 5b0ef343eb0..65af39dec1c 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -2046,10 +2046,10 @@ def reindex(): Create a dataframe with some fictional data. >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] - ... df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301], + >>> df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301], ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, ... index=index) - ... df + >>> df http_status response_time Firefox 200 0.04 Chrome 200 0.02 @@ -2062,7 +2062,7 @@ def reindex(): >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', ... 'Chrome'] - ... df.reindex(new_index) + >>> df.reindex(new_index) http_status response_time Safari 404.0 0.07 Iceweasel NaN NaN @@ -2080,41 +2080,41 @@ def reindex(): IE10 404 0.08 Chrome 200 0.02 - >>> df.reindex(new_index, fill_value='missing') - http_status response_time - Safari 404 0.07 - Iceweasel missing missing - Comodo Dragon missing missing - IE10 404 0.08 - Chrome 200 0.02 + >>> df.reindex(new_index, fill_value=-1) # doctest: +NORMALIZE_WHITESPACE + http_status response_time + Safari 404 0.07 + Iceweasel -1 -1.00 + Comodo Dragon -1 -1.00 + IE10 404 0.08 + Chrome 200 0.02 We can also reindex the columns. - >>> df.reindex(columns=['http_status', 'user_agent']) - http_status user_agent - Firefox 200 NaN - Chrome 200 NaN - Safari 404 NaN - IE10 404 NaN - Konqueror 301 NaN + >>> df.reindex(columns=['http_status', 'user_agent']) # doctest: +NORMALIZE_WHITESPACE + http_status user_agent + Firefox 200 None + Chrome 200 None + Safari 404 None + IE10 404 None + Konqueror 301 None Or we can use “axis-style” keyword arguments - >>> df.reindex(['http_status', 'user_agent'], axis="columns") - http_status user_agent - Firefox 200 NaN - Chrome 200 NaN - Safari 404 NaN - IE10 404 NaN - Konqueror 301 NaN + >>> df.reindex(['http_status', 'user_agent'], axis="columns") # doctest: +NORMALIZE_WHITESPACE + http_status user_agent + Firefox 200 None + Chrome 200 None + Safari 404 None + IE10 404 None + Konqueror 301 None To further illustrate the filling functionality in reindex, we will create a dataframe with a monotonically increasing index (for example, a sequence of dates). >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') - ... df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, + >>> df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, ... index=date_index) - ... df2 + >>> df2 prices 2010-01-01 100.0 2010-01-02 101.0 @@ -2126,7 +2126,7 @@ def reindex(): Suppose we decide to expand the dataframe to cover a wider date range. >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') - ... df2.reindex(date_index2) + >>> df2.reindex(date_index2) prices 2009-12-29 NaN 2009-12-30 NaN @@ -2166,7 +2166,7 @@ def reindex(): An example illustrating Snowpark pandas' behavior when dealing with non-monotonic indices. >>> unordered_dataframe = pd.DataFrame([[5]*3, [8]*3, [6]*3], columns=list("ABC"), index=[5, 8, 6]) - ... unordered_dataframe + >>> unordered_dataframe A B C 5 5 5 5 8 8 8 8 From 37d4a48736142160e283fe0842ff028b34db8023 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 15 Jul 2024 15:45:14 -0700 Subject: [PATCH 19/20] Fix kwarg test --- tests/integ/modin/series/test_reindex.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/integ/modin/series/test_reindex.py b/tests/integ/modin/series/test_reindex.py index 78d2986a366..e17b66d3a66 100644 --- a/tests/integ/modin/series/test_reindex.py +++ b/tests/integ/modin/series/test_reindex.py @@ -53,9 +53,7 @@ def test_reindex_invalid_labels_parameter(): native_series, lambda series: series.reindex(labels=list("CAB")), expect_exception=True, - expect_exception_match=re.escape( - "Series.reindex() got an unexpected keyword argument 'labels'" - ), + expect_exception_match=re.escape("got an unexpected keyword argument 'labels'"), expect_exception_type=TypeError, assert_exception_equal=True, ) @@ -70,9 +68,7 @@ def test_reindex_index_passed_twice(): native_series, lambda series: series.reindex(list("CAB"), index=list("CAB")), expect_exception=True, - expect_exception_match=re.escape( - "Series.reindex() got multiple values for argument 'index'" - ), + expect_exception_match=re.escape("got multiple values for argument 'index'"), expect_exception_type=TypeError, assert_exception_equal=True, ) @@ -87,9 +83,7 @@ def test_reindex_multiple_args_passed(): native_series, lambda series: series.reindex(list("CAB"), index=list("CAB")), expect_exception=True, - expect_exception_match=re.escape( - "Series.reindex() got multiple values for argument 'index'" - ), + expect_exception_match=re.escape("got multiple values for argument 'index'"), expect_exception_type=TypeError, assert_exception_equal=True, ) From 6e57e4d3f75073aac03706628e2d88f75a736045 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 15 Jul 2024 16:18:22 -0700 Subject: [PATCH 20/20] Fix kwarg test --- tests/integ/modin/series/test_reindex.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integ/modin/series/test_reindex.py b/tests/integ/modin/series/test_reindex.py index e17b66d3a66..7c2bbba906e 100644 --- a/tests/integ/modin/series/test_reindex.py +++ b/tests/integ/modin/series/test_reindex.py @@ -55,7 +55,7 @@ def test_reindex_invalid_labels_parameter(): expect_exception=True, expect_exception_match=re.escape("got an unexpected keyword argument 'labels'"), expect_exception_type=TypeError, - assert_exception_equal=True, + assert_exception_equal=False, ) @@ -70,7 +70,7 @@ def test_reindex_index_passed_twice(): expect_exception=True, expect_exception_match=re.escape("got multiple values for argument 'index'"), expect_exception_type=TypeError, - assert_exception_equal=True, + assert_exception_equal=False, ) @@ -85,7 +85,7 @@ def test_reindex_multiple_args_passed(): expect_exception=True, expect_exception_match=re.escape("got multiple values for argument 'index'"), expect_exception_type=TypeError, - assert_exception_equal=True, + assert_exception_equal=False, )