From d8b7d870d453e37518368489bb22181f712f4887 Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Thu, 8 Aug 2024 21:45:42 -0700 Subject: [PATCH] Fix tests and update doctests --- .../snowpark/modin/pandas/general.py | 2 +- src/snowflake/snowpark/modin/pandas/series.py | 5 ++- .../compiler/snowflake_query_compiler.py | 5 ++- .../modin/plugin/docstrings/series_utils.py | 32 +++++++------------ tests/integ/modin/frame/test_duplicated.py | 2 +- tests/integ/modin/test_telemetry.py | 2 +- 6 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 417d1edca5f..4161d316b0e 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -2167,7 +2167,7 @@ def date_range( right_inclusive=right_inclusive, ) # Set date range as index column. - qc = qc.set_index_from_columns(qc.columns.tolist()) + qc = qc.set_index_from_columns(qc.columns.tolist(), include_index=False) # Set index column name. qc = qc.set_index_names([name]) return pd.DatetimeIndex(data=qc) diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 12dc9d10972..a494b513de5 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -130,7 +130,10 @@ def __init__( # modified: # Engine.subscribe(_update_engine) - if isinstance(data, type(self)): + # Convert lazy index to Series without pulling the data to client. 
+ if isinstance(data, pd.Index): + query_compiler = data.to_series(index=index, name=name)._query_compiler + elif isinstance(data, type(self)): query_compiler = data._query_compiler.copy() if index is not None: if any(i not in data.index for i in index): diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index dc0d59e09b9..92e257b71a5 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5531,6 +5531,7 @@ def set_index_from_columns( keys: list[Hashable], drop: Optional[bool] = True, append: Optional[bool] = False, + include_index: Optional[bool] = True, ) -> "SnowflakeQueryCompiler": """ Create or update index (row labels) from a list of columns. @@ -5543,6 +5544,8 @@ def set_index_from_columns( append: bool, default False Whether to add the columns in `keys` as new levels appended to the existing index. + include_index: bool, default True + Whether the keys can also include index column labels. Returns: A new QueryCompiler instance with updated index. @@ -5552,7 +5555,7 @@ def set_index_from_columns( for ( ids ) in self._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( - keys, include_index=False + keys, include_index=include_index ): # Error checking for missing labels is already done in frontend layer.
index_column_snowflake_quoted_identifiers.append(ids[0]) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py index ba729cce6a9..c2519fc76d6 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py @@ -1353,7 +1353,7 @@ def dayofweek(): Examples -------- - >>> s = pd.date_range('2016-12-31', '2017-01-08', freq='D') + >>> s = pd.Series(pd.date_range('2016-12-31', '2017-01-08', freq='D')) >>> s 0 2016-12-31 1 2017-01-01 @@ -1390,7 +1390,7 @@ def dayofyear(): Examples -------- - >>> s = pd.to_datetime(["1/1/2020", "2/1/2020"]) + >>> s = pd.Series(pd.to_datetime(["1/1/2020", "2/1/2020"])) >>> s 0 2020-01-01 1 2020-02-01 @@ -1670,9 +1670,8 @@ def is_leap_year(): This method is available on Series with datetime values under the .dt accessor, and directly on DatetimeIndex. >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="YE") - >>> idx # doctest: +SKIP - DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], - dtype='datetime64[ns]', freq='YE-DEC') + >>> idx + DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], dtype='datetime64[ns]', freq=None) >>> idx.is_leap_year # doctest: +SKIP array([ True, False, False]) @@ -1688,7 +1687,6 @@ def is_leap_year(): 2 False dtype: bool """ - # TODO(SNOW-1486910): Unskip when date_range returns DatetimeIndex. 
@property def daysinmonth(): @@ -1762,22 +1760,19 @@ def month_name(): dtype: object >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='ME') + >>> idx + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq=None) >>> idx.month_name() # doctest: +SKIP Index(['January', 'February', 'March'], dtype='object') Using the locale parameter you can set a different locale language, for example: idx.month_name(locale='pt_BR.utf8') will return month names in Brazilian Portuguese language. >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='ME') + >>> idx + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq=None) >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') """ - # TODO(SNOW-1486910): Unskip when date_range returns DatetimeIndex. def day_name(): """ @@ -1808,22 +1803,19 @@ def day_name(): dtype: object >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], - dtype='datetime64[ns]', freq='D') + >>> idx + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq=None) >>> idx.day_name() # doctest: +SKIP Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object') Using the locale parameter you can set a different locale language, for example: idx.day_name(locale='pt_BR.utf8') will return day names in Brazilian Portuguese language. 
>>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], - dtype='datetime64[ns]', freq='D') + >>> idx + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq=None) >>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP Index(['Segunda', 'Terça', 'Quarta'], dtype='object') """ - # TODO(SNOW-1486910): Unskip when date_range returns DatetimeIndex. def total_seconds(): pass diff --git a/tests/integ/modin/frame/test_duplicated.py b/tests/integ/modin/frame/test_duplicated.py index d0031adf0c4..e4c5d594ecc 100644 --- a/tests/integ/modin/frame/test_duplicated.py +++ b/tests/integ/modin/frame/test_duplicated.py @@ -93,7 +93,7 @@ def test_duplicated_on_empty_frame(): @sql_count_checker(query_count=3, join_count=2) def test_frame_datetime64_duplicated(): - dates = pd.date_range("2010-07-01", end="2010-08-05") + dates = pd.date_range("2010-07-01", end="2010-08-05").to_series() tst = pd.DataFrame({"symbol": "AAA", "date": dates}) result = tst.duplicated(["date", "symbol"]) diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index c6c17313489..c908b56c56a 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -325,7 +325,7 @@ def sample_function( ) @sql_count_checker(query_count=7, fallback_count=1, sproc_count=1) def test_property_methods_telemetry(): - datetime_series = pd.date_range("2000-01-01", periods=3, freq="h") + datetime_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="h")) ret_series = datetime_series.dt.timetz assert len(ret_series._query_compiler.snowpark_pandas_api_calls) == 1 api_call = ret_series._query_compiler.snowpark_pandas_api_calls[0]