Skip to content

Commit

Permalink
SNOW-1642293 Add support for lazy index labels in reindex and fix rei…
Browse files Browse the repository at this point in the history
…ndex name bug (#2175)

<!---
Please answer these questions before creating your pull request. Thanks!
--->

1. Which Jira issue is this PR addressing? Make sure that there is an
accompanying issue to your PR.

   <!---
   In this section, please add a Snowflake Jira issue number.
   
Note that if a corresponding GitHub issue exists, you should still
include
   the Snowflake Jira issue number. For example, for GitHub issue
#1400, you should
   add "SNOW-1335071" here.
    --->

   Fixes SNOW-1642293

2. Fill out the following pre-review checklist:

- [x] I am adding a new automated test(s) to verify correctness of my
new code
- [ ] If this test skips Local Testing mode, I'm requesting review from
@snowflakedb/local-testing
   - [ ] I am adding new logging messages
   - [ ] I am adding a new telemetry message
   - [ ] I am adding new credentials
   - [ ] I am adding a new dependency
- [ ] If this is a new feature/behavior, I'm adding the Local Testing
parity changes.

3. Please describe how your code solves the related issue.

1. Added support for using lazy index labels with reindex.
```py
>>> ser = pd.Series([1, 2, 3], index=["A", "B", "C"])
>>> idx = pd.Index(["X", "Y", "Z"])
>>> ser.reindex(idx)
X   NaN
Y   NaN
Z   NaN
dtype: float64
```

2. `reindex` has a bug - if it is performed with an index `idx` which
has a name, it does not update the result series'/df's index name
accordingly. The name remains None.

For instance,
```py
>>> ser = pd.Series([0, 1, 2], index=list("ABC"), name="test")
>>> idx = native_pd.Index(list("CAB"), name="weewoo")
>>> snow_series.reindex(index=idx)
C    2
A    0
B    1
Name: test, dtype: int64

# Instead of:
weewoo
C      2
A      0
B      1
Name: test, dtype: int64
``` 

3. Fixed a bug where `Index` objects name was not set correctly during
binary operations.

---------

Co-authored-by: Andong Zhan <[email protected]>
  • Loading branch information
sfc-gh-vbudati and sfc-gh-azhan authored Sep 4, 2024
1 parent afbc6f6 commit 71a2182
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 7 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
- Added support for `Index.is_monotonic_increasing` and `Index.is_monotonic_decreasing`.
- Added support for `pd.crosstab`.
- Added support for `pd.bdate_range` and included business frequency support (B, BME, BMS, BQE, BQS, BYE, BYS) for both `pd.date_range` and `pd.bdate_range`.
- Added support for lazy `Index` objects as `labels` in `DataFrame.reindex` and `Series.reindex`.

#### Improvements

Expand All @@ -97,6 +98,7 @@
- Fixed AssertionError in tree of binary operations.
- Fixed bug in `Series.dt.isocalendar` using a named Series
- Fixed `inplace` argument for Series objects derived from DataFrame columns.
- Fixed a bug where `Series.reindex` and `DataFrame.reindex` did not update the result index's name correctly.

#### Behavior Change

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2260,7 +2260,7 @@ def any(
def reindex(
self,
axis: int,
labels: Union[pandas.Index, list[Any]],
labels: Union[pandas.Index, "pd.Index", list[Any]],
**kwargs: dict[str, Any],
) -> "SnowflakeQueryCompiler":
"""
Expand Down Expand Up @@ -2468,7 +2468,7 @@ def _add_columns_for_monotonicity_checks(

def _reindex_axis_0(
self,
labels: Union[pandas.Index, list[Any]],
labels: Union[pandas.Index, "pd.Index", list[Any]],
**kwargs: dict[str, Any],
) -> "SnowflakeQueryCompiler":
"""
Expand All @@ -2494,7 +2494,13 @@ def _reindex_axis_0(
"""
self._raise_not_implemented_error_for_timedelta()

new_index_qc = pd.Series(labels)._query_compiler
if isinstance(labels, native_pd.Index):
labels = pd.Index(labels)
if isinstance(labels, pd.Index):
new_index_qc = labels.to_series()._query_compiler
else:
new_index_qc = pd.Series(labels)._query_compiler

new_index_modin_frame = new_index_qc._modin_frame
modin_frame = self._modin_frame
method = kwargs.get("method", None)
Expand Down Expand Up @@ -2583,7 +2589,7 @@ def _reindex_axis_0(
data_column_pandas_labels=data_column_pandas_labels,
data_column_snowflake_quoted_identifiers=data_column_snowflake_quoted_identifiers,
data_column_pandas_index_names=modin_frame.data_column_pandas_index_names,
index_column_pandas_labels=modin_frame.index_column_pandas_labels,
index_column_pandas_labels=new_index_modin_frame.index_column_pandas_labels,
index_column_snowflake_quoted_identifiers=result_frame_column_mapper.map_left_quoted_identifiers(
new_index_modin_frame.data_column_snowflake_quoted_identifiers
),
Expand Down
43 changes: 43 additions & 0 deletions tests/integ/modin/frame/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,3 +569,46 @@ def test_reindex_multiindex_negative(axis):
snow_df.reindex(index=[1, 2, 3])
else:
snow_df.T.reindex(columns=[1, 2, 3])


@sql_count_checker(query_count=1, join_count=1)
def test_reindex_with_index_name():
native_df = native_pd.DataFrame(
[[0, 1, 2], [0, 0, 1], [1, 0, 0]],
index=list("ABC"),
)
snow_df = pd.DataFrame(native_df)
index_with_name = native_pd.Index(list("CAB"), name="weewoo")
assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
snow_df.reindex(index=index_with_name), native_df.reindex(index=index_with_name)
)


@sql_count_checker(query_count=1, join_count=1)
def test_reindex_with_index_name_and_df_index_name():
native_df = native_pd.DataFrame(
{"X": [1, 2, 3], "Y": [8, 7, 3], "Z": [3, 4, 5]},
index=native_pd.Index(list("ABC"), name="AAAAA"),
)
snow_df = pd.DataFrame(native_df)
index_with_name = native_pd.Index(list("CAB"), name="weewoo")
assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
snow_df.reindex(index=index_with_name), native_df.reindex(index=index_with_name)
)


@sql_count_checker(query_count=1, join_count=1)
def test_reindex_with_lazy_index():
native_df = native_pd.DataFrame(
[[1, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], index=list("XYZ")
)
snow_df = pd.DataFrame(native_df)
native_idx = native_pd.Index(list("CAB"))
lazy_idx = pd.Index(native_idx)
eval_snowpark_pandas_result(
snow_df,
native_df,
lambda df: df.reindex(
index=native_idx if isinstance(df, native_pd.DataFrame) else lazy_idx
),
)
4 changes: 1 addition & 3 deletions tests/integ/modin/index/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,7 @@ def test_ordered_index_unordered_new_index():
@pytest.mark.parametrize("limit", [None, 1, 2, 100])
@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"])
def test_datetime_with_fill(limit, method):
query_count = 2
join_count = 2
with SqlCounter(query_count=query_count, join_count=join_count):
with SqlCounter(query_count=2 if limit is None else 3, join_count=2):
native_date_index = native_pd.date_range("1/1/2010", periods=6, freq="D")
snow_date_index = pd.date_range("1/1/2010", periods=6, freq="D")
assert_reindex_result_equal(
Expand Down
39 changes: 39 additions & 0 deletions tests/integ/modin/series/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,3 +376,42 @@ def test_reindex_multiindex_negative():
match="Snowpark pandas doesn't support `reindex` with MultiIndex",
):
snow_series.reindex(index=[1, 2, 3])


@sql_count_checker(query_count=1, join_count=1)
def test_reindex_with_index_name():
native_series = native_pd.Series([0, 1, 2], index=list("ABC"), name="test")
snow_series = pd.Series(native_series)
index_with_name = native_pd.Index(list("CAB"), name="weewoo")
assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
snow_series.reindex(index=index_with_name),
native_series.reindex(index=index_with_name),
)


@sql_count_checker(query_count=1, join_count=1)
def test_reindex_with_index_name_and_series_index_name():
native_series = native_pd.Series(
[0, 1, 2], index=native_pd.Index(list("ABC"), name="AAAAA"), name="test"
)
snow_series = pd.Series(native_series)
index_with_name = native_pd.Index(list("CAB"), name="weewoo")
assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
snow_series.reindex(index=index_with_name),
native_series.reindex(index=index_with_name),
)


@sql_count_checker(query_count=1, join_count=1)
def test_reindex_with_lazy_index():
native_series = native_pd.Series([0, 1, 2], index=list("ABC"))
snow_series = pd.Series(native_series)
native_idx = native_pd.Index(list("CAB"))
lazy_idx = pd.Index(native_idx)
eval_snowpark_pandas_result(
snow_series,
native_series,
lambda series: series.reindex(
index=native_idx if isinstance(series, native_pd.Series) else lazy_idx
),
)

0 comments on commit 71a2182

Please sign in to comment.