diff --git a/CHANGELOG.md b/CHANGELOG.md
index dc28d76f3b8..4339973da47 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -79,6 +79,8 @@
 ### Behavior change
 - `DataFrame.columns` now returns native pandas Index object instead of Snowpark Index object.
 - Refactor and introduce `query_compiler` argument in `Index` constructor to create `Index` from query compiler.
+- `pd.to_datetime` now returns a DatetimeIndex object instead of a Series object.
+- `pd.date_range` now returns a DatetimeIndex object instead of a Series object.

 ## 1.20.0 (2024-07-17)

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index 468b31f5367..417d1edca5f 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -1572,10 +1572,7 @@ def to_datetime(

    >>> pd.to_datetime([1, 2, 3], unit='D',
    ...                origin=pd.Timestamp('1960-01-01'))
-    0   1960-01-02
-    1   1960-01-03
-    2   1960-01-04
-    dtype: datetime64[ns]
+    DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)

    **Non-convertible date/times**

@@ -1589,9 +1586,7 @@
    in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.

    >>> pd.to_datetime(['13000101', 'abc'], format='%Y%m%d', errors='coerce')
-    0   NaT
-    1   NaT
-    dtype: datetime64[ns]
+    DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None)

    .. _to_datetime_tz_examples:

@@ -1603,55 +1598,41 @@
    **Timezones and time offsets**

    The default behaviour (``utc=False``) is as follows:

    - Timezone-naive inputs are converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex`:

    >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15'])
-    0   2018-10-26 12:00:00
-    1   2018-10-26 13:00:15
-    dtype: datetime64[ns]
+    DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None)

    - Timezone-aware inputs *with constant time offset* are still converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex` by default.
>>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'])
-    0   2018-10-26 12:00:00
-    1   2018-10-26 13:00:00
-    dtype: datetime64[ns]
+    DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:00'], dtype='datetime64[ns]', freq=None)

    - Use the right format to convert to a timezone-aware type (note that when calling the Snowpark pandas API ``to_pandas()``, the timezone-aware output will always be converted to the session timezone):

    >>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'], format="%Y-%m-%d %H:%M:%S %z")
-    0   2018-10-26 10:00:00-07:00
-    1   2018-10-26 11:00:00-07:00
-    dtype: datetime64[ns, America/Los_Angeles]
+    DatetimeIndex(['2018-10-26 10:00:00-07:00', '2018-10-26 11:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)

    - Timezone-aware inputs *with mixed time offsets* (for example issued from a timezone with daylight savings, such as Europe/Paris):

    >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'])
-    0   2020-10-25 02:00:00
-    1   2020-10-25 04:00:00
-    dtype: datetime64[ns]
+    DatetimeIndex(['2020-10-25 02:00:00', '2020-10-25 04:00:00'], dtype='datetime64[ns]', freq=None)

    >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'], format="%Y-%m-%d %H:%M:%S %z")
-    0   2020-10-24 17:00:00-07:00
-    1   2020-10-24 20:00:00-07:00
-    dtype: datetime64[ns, America/Los_Angeles]
+    DatetimeIndex(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)

    Setting ``utc=True`` ensures that outputs are always timezone-aware:

    - Timezone-naive inputs are *localized* based on the session timezone

    >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
-    0   2018-10-26 12:00:00-07:00
-    1   2018-10-26 13:00:00-07:00
-    dtype: datetime64[ns, America/Los_Angeles]
+    DatetimeIndex(['2018-10-26 12:00:00-07:00', '2018-10-26 13:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)

    - Timezone-aware inputs are *converted* to session timezone

    >>> pd.to_datetime(['2018-10-26 12:00:00 -0530', '2018-10-26 12:00:00 -0500'],
    ...                utc=True)
-    0   2018-10-26 10:30:00-07:00
-    1   2018-10-26 10:00:00-07:00
-    dtype: datetime64[ns, America/Los_Angeles]
+    DatetimeIndex(['2018-10-26 10:30:00-07:00', '2018-10-26 10:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
    """
    # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
    raise_if_native_pandas_objects(arg)
@@ -1668,22 +1649,14 @@ def to_datetime(
        message="cache parameter is ignored with Snowflake backend, i.e., no caching will be applied",
    )
    arg_is_scalar = is_scalar(arg)
-    # handle empty array, list, dict
-    if not arg_is_scalar and not isinstance(arg, (DataFrame, Series)) and len(arg) == 0:
-        return arg if isinstance(arg, Series) else Series(arg)  # always return a Series
-    if not isinstance(arg, (DataFrame, Series)):
-        # turn dictionary like arg into DataFrame and list like or scalar to Series
-        if isinstance(arg, dict):
-            arg = DataFrame(arg)  # pragma: no cover
-        else:
-            name = None
-            # keep index name
-            if isinstance(arg, pd.Index):
-                name = arg.name
-            arg = Series(arg)
-            arg.name = name
-
-    series = arg._to_datetime(
+
+    if not isinstance(arg, (DataFrame, Series, pd.Index)):
+        # Turn a dictionary-like arg into a pd.DataFrame, and a list-like or
+        # scalar arg into a pd.Index.
+ arg = [arg] if arg_is_scalar else arg + arg = DataFrame(arg) if isinstance(arg, dict) else pd.Index(arg) + + series_or_index = arg._to_datetime( errors=errors, dayfirst=dayfirst, yearfirst=yearfirst, @@ -1697,9 +1670,10 @@ def to_datetime( if arg_is_scalar: # Calling squeeze directly on Snowpark pandas Series makes an unnecessary # count sql call. To avoid that we convert Snowpark pandas Series to Native - # pandas seris first. - return series.to_pandas().squeeze() - return series + # pandas series first. + # Note: When arg_is_scalar is True 'series_or_index' is always an Index. + return series_or_index.to_series().to_pandas().squeeze() + return series_or_index @snowpark_pandas_telemetry_standalone_function_decorator @@ -2004,9 +1978,9 @@ def date_range( name: Hashable | None = None, inclusive: IntervalClosedType = "both", **kwargs, -) -> Series: +) -> pd.DatetimeIndex: """ - Return a fixed frequency series. + Return a fixed frequency DatetimeIndex. Returns the range of equally spaced time points (where the difference between any two adjacent points is specified by the given frequency) such that they all @@ -2078,109 +2052,72 @@ def date_range( Specify `start` and `end`, with the default daily frequency. >>> pd.date_range(start='1/1/2018', end='1/08/2018') - 0 2018-01-01 - 1 2018-01-02 - 2 2018-01-03 - 3 2018-01-04 - 4 2018-01-05 - 5 2018-01-06 - 6 2018-01-07 - 7 2018-01-08 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq=None) Specify `start` and `periods`, the number of periods (days). >>> pd.date_range(start='1/1/2018', periods=8) - 0 2018-01-01 - 1 2018-01-02 - 2 2018-01-03 - 3 2018-01-04 - 4 2018-01-05 - 5 2018-01-06 - 6 2018-01-07 - 7 2018-01-08 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq=None) Specify `end` and `periods`, the number of periods (days). >>> pd.date_range(end='1/1/2018', periods=8) - 0 2017-12-25 - 1 2017-12-26 - 2 2017-12-27 - 3 2017-12-28 - 4 2017-12-29 - 5 2017-12-30 - 6 2017-12-31 - 7 2018-01-01 - dtype: datetime64[ns] + DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28', + '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq=None) Specify `start`, `end`, and `periods`; the frequency is generated automatically (linearly spaced). >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3) - 0 2018-04-24 00:00:00 - 1 2018-04-25 12:00:00 - 2 2018-04-27 00:00:00 - dtype: datetime64[ns] + DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00', + '2018-04-27 00:00:00'], + dtype='datetime64[ns]', freq=None) + **Other Parameters** Changed the `freq` (frequency) to ``'ME'`` (month end frequency). >>> pd.date_range(start='1/1/2018', periods=5, freq='ME') - 0 2018-01-31 - 1 2018-02-28 - 2 2018-03-31 - 3 2018-04-30 - 4 2018-05-31 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', + '2018-05-31'], + dtype='datetime64[ns]', freq=None) Multiples are allowed >>> pd.date_range(start='1/1/2018', periods=5, freq='3ME') - 0 2018-01-31 - 1 2018-04-30 - 2 2018-07-31 - 3 2018-10-31 - 4 2019-01-31 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq=None) `freq` can also be specified as an Offset object. 
>>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)) - 0 2018-01-31 - 1 2018-04-30 - 2 2018-07-31 - 3 2018-10-31 - 4 2019-01-31 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq=None) `inclusive` controls whether to include `start` and `end` that are on the boundary. The default, "both", includes boundary points on either end. >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive="both") - 0 2017-01-01 - 1 2017-01-02 - 2 2017-01-03 - 3 2017-01-04 - dtype: datetime64[ns] + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None) Use ``inclusive='left'`` to exclude `end` if it falls on the boundary. >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='left') - 0 2017-01-01 - 1 2017-01-02 - 2 2017-01-03 - dtype: datetime64[ns] + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq=None) Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and similarly ``inclusive='neither'`` will exclude both `start` and `end`. >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right') - 0 2017-01-02 - 1 2017-01-03 - 2 2017-01-04 - dtype: datetime64[ns] + DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None) """ # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py @@ -2229,9 +2166,11 @@ def date_range( left_inclusive=left_inclusive, right_inclusive=right_inclusive, ) - s = Series(query_compiler=qc) - s.name = name - return s + # Set date range as index column. + qc = qc.set_index_from_columns(qc.columns.tolist()) + # Set index column name. + qc = qc.set_index_names([name]) + return pd.DatetimeIndex(data=qc) @snowpark_pandas_telemetry_standalone_function_decorator diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 22ac143dfe2..17fcf35f6d7 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5551,7 +5551,7 @@ def set_index_from_columns( for ( ids ) in self._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( - keys + keys, include_index=False ): # Error checking for missing labels is already done in frontend layer. index_column_snowflake_quoted_identifiers.append(ids[0]) @@ -5870,6 +5870,7 @@ def series_to_datetime( unit: Optional[str] = None, infer_datetime_format: Union[lib.NoDefault, bool] = lib.no_default, origin: DateTimeOrigin = "unix", + include_index: bool = False, ) -> "SnowflakeQueryCompiler": """ Convert series to the datetime dtype. @@ -5884,6 +5885,7 @@ def series_to_datetime( unit: to_datetime unit infer_datetime_format: to_datetime infer_datetime_format origin: to_datetime origin + include_index: If True, also convert index columns to datetime. Returns: SnowflakeQueryCompiler: QueryCompiler with a single data column converted to datetime dtype. 
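A quick sanity check of the new return types introduced in the general.py changes above. This is a minimal sketch, not part of the patch: it assumes a configured Snowflake session and the `snowflake.snowpark.modin.pandas` module this diff touches; the variable names are illustrative only.

    import snowflake.snowpark.modin.pandas as pd

    # Both entry points now produce a DatetimeIndex rather than a Series.
    idx = pd.date_range(start="1/1/2018", periods=3)
    dt = pd.to_datetime(["2018-10-26 12:00", "2018-10-26 13:00:15"])
    assert type(idx).__name__ == "DatetimeIndex"
    assert type(dt).__name__ == "DatetimeIndex"

    # Scalar input still collapses to a scalar Timestamp, via the
    # arg_is_scalar branch above (to_series().to_pandas().squeeze()).
    ts = pd.to_datetime("1960-01-02")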
@@ -5896,12 +5898,16 @@ def series_to_datetime( to_snowflake_timestamp_format(format) if format is not None else None ) id_to_sf_type_map = self._modin_frame.quoted_identifier_to_snowflake_type() - col_id = self._modin_frame.data_column_snowflake_quoted_identifiers[0] - sf_type = id_to_sf_type_map[col_id] + col_ids = [] + if include_index: + col_ids = self._modin_frame.index_column_snowflake_quoted_identifiers + col_ids.extend(self._modin_frame.data_column_snowflake_quoted_identifiers) - if isinstance(sf_type, BooleanType): - # bool is not allowed in to_datetime (but note that bool is allowed by astype) - raise TypeError("dtype bool cannot be converted to datetime64[ns]") + for col_id in col_ids: + sf_type = id_to_sf_type_map[col_id] + if isinstance(sf_type, BooleanType): + # bool is not allowed in to_datetime (but note that bool is allowed by astype) + raise TypeError("dtype bool cannot be converted to datetime64[ns]") to_datetime_cols = { col_id: generate_timestamp_col( @@ -5913,6 +5919,7 @@ def series_to_datetime( unit="ns" if unit is None else unit, origin=origin, ) + for col_id in col_ids } return SnowflakeQueryCompiler( self._modin_frame.update_snowflake_quoted_identifiers_with_expressions( diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index ef94d7bb7ea..e11ac325f0d 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -30,7 +30,7 @@ import numpy as np import pandas as native_pd from pandas._libs import lib -from pandas._typing import ArrayLike, DtypeObj, NaPosition +from pandas._typing import ArrayLike, DateTimeErrorChoices, DtypeObj, NaPosition from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import is_datetime64_any_dtype, pandas_dtype @@ -39,6 +39,7 @@ from snowflake.snowpark.modin.pandas.base import BasePandasDataset from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta +from snowflake.snowpark.modin.plugin._internal.timestamp_utils import DateTimeOrigin from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import ( SnowflakeQueryCompiler, ) @@ -2368,3 +2369,64 @@ def str(self) -> native_pd.core.strings.accessor.StringMethods: """ WarningMessage.index_to_pandas_warning("str") return self.to_pandas().str + + def _to_datetime( + self, + errors: DateTimeErrorChoices = "raise", + dayfirst: bool = False, + yearfirst: bool = False, + utc: bool = False, + format: str = None, + exact: bool | lib.NoDefault = lib.no_default, + unit: str = None, + infer_datetime_format: bool | lib.NoDefault = lib.no_default, + origin: DateTimeOrigin = "unix", + ) -> Index: + """ + Args: + errors: {'ignore', 'raise', 'coerce'}, default 'raise' + If 'raise', then invalid parsing will raise an exception. + If 'coerce', then invalid parsing will be set as NaT. + If 'ignore', then invalid parsing will return the input. + dayfirst: bool, default False + Specify a date parse order if arg is str or is list-like. + yearfirst: bool, default False + Specify a date parse order if arg is str or is list-like. + utc: bool, default False + Control timezone-related parsing, localization and conversion. + format: str, default None + The strftime to parse time + exact: bool, default True + Control how format is used: + True: require an exact format match. 
+ False: allow the format to match anywhere in the target string. + unit: str, default 'ns' + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer + or float number. + infer_datetime_format: bool, default False + If True and no format is given, attempt to infer the format of the \ + datetime strings based on the first non-NaN element. + origin: scalar, default 'unix' + Define the reference date. The numeric values would be parsed as number + of units (defined by unit) since this reference date. + + Returns: + DatetimeIndex + """ + from snowflake.snowpark.modin.plugin.extensions.datetime_index import ( + DatetimeIndex, + ) + + new_qc = self._query_compiler.series_to_datetime( + errors, + dayfirst, + yearfirst, + utc, + format, + exact, + unit, + infer_datetime_format, + origin, + include_index=True, + ) + return DatetimeIndex(data=new_qc) diff --git a/tests/integ/modin/tools/test_date_range.py b/tests/integ/modin/tools/test_date_range.py index a1f3c4d0ba2..d26861442f4 100644 --- a/tests/integ/modin/tools/test_date_range.py +++ b/tests/integ/modin/tools/test_date_range.py @@ -63,7 +63,7 @@ @sql_count_checker(query_count=1) def test_regular_range(kwargs): assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) @@ -125,7 +125,7 @@ def test_regular_range(kwargs): @sql_count_checker(query_count=1) def test_irregular_range(kwargs): assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) @@ -168,7 +168,7 @@ def test_without_freq(periods, inclusive): "inclusive": inclusive, } assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) @@ -193,7 +193,7 @@ def test_without_freq(periods, inclusive): def test_inclusive(kwargs, inclusive): kwargs.update({"inclusive": inclusive}) assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) diff --git a/tests/integ/modin/tools/test_to_datetime.py b/tests/integ/modin/tools/test_to_datetime.py index d08495b31e9..07fb4aefebf 100644 --- a/tests/integ/modin/tools/test_to_datetime.py +++ b/tests/integ/modin/tools/test_to_datetime.py @@ -26,6 +26,7 @@ ) from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( + assert_index_equal, assert_series_equal, assert_snowpark_pandas_equal_to_pandas, eval_snowpark_pandas_result, @@ -58,10 +59,10 @@ def test_to_datetime_readonly(self, readonly): if readonly: arr.setflags(write=False) result = to_datetime(arr) - expected = Series([], dtype=object) - assert_series_equal(result, expected) + expected = pd.DatetimeIndex([]) + assert_index_equal(result, expected) - @pytest.mark.parametrize("box", [Series, native_pd.Index]) + @pytest.mark.parametrize("box", [Series, pd.Index]) @pytest.mark.parametrize( "format, expected", [ @@ -79,11 +80,18 @@ def test_to_datetime_readonly(self, readonly): def test_to_datetime_format(self, cache, box, format, expected): values = box(["1/1/2000", "1/2/2000", "1/3/2000"]) result = to_datetime(values, format=format, cache=cache) - expected = Series(expected) - assert_series_equal(result, expected) + expected = box(expected) + if box is Series: + assert_series_equal(result, 
expected) + else: + assert_index_equal(result, expected) + # cache values is ignored at Snowpark pandas so only test here to make sure it works as well result = to_datetime(values, format=format, cache=False) - assert_series_equal(result, expected) + if box is Series: + assert_series_equal(result, expected) + else: + assert_index_equal(result, expected) @pytest.mark.parametrize( "arg, expected, format", @@ -235,9 +243,9 @@ def test_to_datetime_format_YYYYMMDD_with_none(self, input_s): # GH 30011 # format='yyyymmdd' # with None - expected = Series([Timestamp("19801222"), Timestamp("20010112"), NaT]) - result = Series(to_datetime(input_s, format="%Y%m%d")) - assert_series_equal(result, expected) + expected = pd.DatetimeIndex([Timestamp("19801222"), Timestamp("20010112"), NaT]) + result = to_datetime(input_s, format="%Y%m%d") + assert_index_equal(result, expected) @pytest.mark.parametrize( "input, expected", @@ -295,7 +303,7 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input, expected): def test_to_datetime_with_NA(self, data, format, expected): # GH#42957 result = to_datetime(data, format=format) - assert_series_equal(result, Series(expected)) + assert_index_equal(result, pd.DatetimeIndex(expected)) @sql_count_checker(query_count=1, udf_count=0) def test_to_datetime_format_integer_year_only(self, cache): @@ -509,10 +517,10 @@ def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self): fmt = "%Y-%m-%d %H:%M:%S %z" result = to_datetime(dates, format=fmt, utc=True) - expected = Series(expected_dates) - assert_series_equal(result, expected) + expected = pd.DatetimeIndex(expected_dates) + assert_index_equal(result, expected) result2 = to_datetime(dates, utc=True) - assert_series_equal(result2, expected) + assert_index_equal(result2, expected) @pytest.mark.parametrize( "offset", ["+0", "-1foo", "UTCbar", ":10", "+01:000:01", ""] @@ -529,8 +537,7 @@ def test_to_datetime_parse_timezone_malformed(self, offset): ): to_datetime([date], format=fmt).to_pandas() - # 2 extra queries to convert index to series - @sql_count_checker(query_count=2) + @sql_count_checker(query_count=0) def test_to_datetime_parse_timezone_keeps_name(self): # GH 21697 fmt = "%Y-%m-%d %H:%M:%S %z" @@ -547,14 +554,14 @@ def test_to_datetime_mixed_datetime_and_string(self): res = to_datetime(["2020-01-01 17:00:00 -0100", d2]) # The input will become a series with variant type and the timezone is unaware by the Snowflake engine, so the # result ignores the timezone by default - expected = native_pd.Series( + expected = native_pd.DatetimeIndex( [datetime(2020, 1, 1, 17), datetime(2020, 1, 1, 18)] ) - assert_series_equal(res, expected, check_dtype=False, check_index_type=False) + assert_index_equal(res, expected) # Set utc=True to make sure timezone aware in to_datetime res = to_datetime(["2020-01-01 17:00:00 -0100", d2], utc=True) - expected = pd.Series([d1, d2]) - assert_series_equal(res, expected, check_dtype=False, check_index_type=False) + expected = pd.DatetimeIndex([d1, d2]) + assert_index_equal(res, expected) @pytest.mark.parametrize( "tz", @@ -563,13 +570,14 @@ def test_to_datetime_mixed_datetime_and_string(self): pytest.param("US/Central"), ], ) - @sql_count_checker(query_count=2) + @sql_count_checker(query_count=3) def test_to_datetime_dtarr(self, tz): # DatetimeArray dti = native_pd.date_range("1965-04-03", periods=19, freq="2W", tz=tz) arr = DatetimeArray(dti) + # Use assert_series_equal to ignore timezone difference in dtype. 
assert_series_equal( - to_datetime(arr), + Series(to_datetime(arr)), Series(arr), check_dtype=False, ) diff --git a/tests/integ/modin/utils.py b/tests/integ/modin/utils.py index 6ec630024a3..06728f9e985 100644 --- a/tests/integ/modin/utils.py +++ b/tests/integ/modin/utils.py @@ -271,7 +271,8 @@ def assert_snowpark_pandas_equal_to_pandas( tm.assert_series_equal(snow_to_native, expected_pandas, **kwargs) else: assert isinstance(snow, Index) - kwargs.pop("check_dtype") + if "check_dtype" in kwargs: + kwargs.pop("check_dtype") if kwargs.pop("check_index_type"): kwargs.update(exact=False) tm.assert_index_equal(snow_to_native, expected_pandas, **kwargs)
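Side note on the final utils.py hunk: since `dict.pop` accepts a default value, the added membership check can be collapsed into a single call with identical behavior. A minimal equivalent sketch:

    # pop() with a default never raises KeyError, so no guard is needed.
    kwargs.pop("check_dtype", None)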