From ce50b6f73a504b76802241deb817751fcdc3516a Mon Sep 17 00:00:00 2001 From: azhan Date: Wed, 28 Aug 2024 15:48:51 -0700 Subject: [PATCH] save --- CHANGELOG.md | 1 + .../snowpark/modin/pandas/general.py | 55 ++++++++++--------- .../modin/plugin/_internal/timestamp_utils.py | 2 - tests/integ/modin/tools/test_to_datetime.py | 45 ++++++++------- 4 files changed, 55 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad1c8e9cb95..b8027c52475 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -76,6 +76,7 @@ #### Improvements - Refactored `quoted_identifier_to_snowflake_type` to avoid making metadata queries if the types have been cached locally. +- Improved `pd.to_datetime` to handle all local input cases. #### Bug Fixes diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 3a9198a5fea..8decfcd8a91 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -1741,16 +1741,13 @@ def to_datetime( The default behaviour (``utc=False``) is as follows: - - Timezone-naive inputs are converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series`: + - Timezone-naive inputs are kept as timezone-naive :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex`: - >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15']) + >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15']) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None) - - Timezone-aware inputs *with constant time offset* are still converted to - timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series` by default. - >>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500']) - DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:00'], dtype='datetime64[ns]', freq=None) + DatetimeIndex(['2018-10-26 10:00:00-07:00', '2018-10-26 11:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) - Use right format to convert to timezone-aware type (Note that when call Snowpark pandas API to_pandas() the timezone-aware output will always be converted to session timezone): @@ -1762,17 +1759,17 @@ def to_datetime( issued from a timezone with daylight savings, such as Europe/Paris): >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100']) - DatetimeIndex(['2020-10-25 02:00:00', '2020-10-25 04:00:00'], dtype='datetime64[ns]', freq=None) + Index(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns]') >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'], format="%Y-%m-%d %H:%M:%S %z") - DatetimeIndex(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) + Index(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns]') Setting ``utc=True`` makes sure always convert to timezone-aware outputs: - Timezone-naive inputs are *localized* based on the session timezone >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True) - DatetimeIndex(['2018-10-26 12:00:00-07:00', '2018-10-26 13:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) + DatetimeIndex(['2018-10-26 05:00:00-07:00', '2018-10-26 06:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) - Timezone-aware inputs are *converted* to session timezone @@ -1783,8 +1780,28 @@ def to_datetime( # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py raise_if_native_pandas_objects(arg) - if arg is None: - return None # same as pandas + if not isinstance(arg, (DataFrame, Series, pd.Index)): + # use pandas.to_datetime to convert local data to datetime + res = pandas.to_datetime( + arg, + errors, + dayfirst, + yearfirst, + utc, + format, + exact, + unit, + infer_datetime_format, + origin, + cache, + ) + if isinstance(res, pandas.Series): + res = pd.Series(res) + elif not is_scalar(res): + res = pd.Index(res) + return res + + # handle modin objs if unit and unit not in VALID_TO_DATETIME_UNIT: raise ValueError(f"Unrecognized unit {unit}") @@ -1794,15 +1811,8 @@ def to_datetime( argument="cache", message="cache parameter is ignored with Snowflake backend, i.e., no caching will be applied", ) - arg_is_scalar = is_scalar(arg) - - if not isinstance(arg, (DataFrame, Series, pd.Index)): - # Turn dictionary like arg into pd.DataFrame and list-like or scalar to - # pd.Index. - arg = [arg] if arg_is_scalar else arg - arg = DataFrame(arg) if isinstance(arg, dict) else pd.Index(arg) - series_or_index = arg._to_datetime( + return arg._to_datetime( errors=errors, dayfirst=dayfirst, yearfirst=yearfirst, @@ -1813,13 +1823,6 @@ def to_datetime( infer_datetime_format=infer_datetime_format, origin=origin, ) - if arg_is_scalar: - # Calling squeeze directly on Snowpark pandas Series makes an unnecessary - # count sql call. To avoid that we convert Snowpark pandas Series to Native - # pandas series first. - # Note: When arg_is_scalar is True 'series_or_index' is always an Index. - return series_or_index.to_series().to_pandas().squeeze() - return series_or_index @snowpark_pandas_telemetry_standalone_function_decorator diff --git a/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py index e0acfaa59e6..25e397bfd84 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/timestamp_utils.py @@ -26,7 +26,6 @@ ) from snowflake.snowpark.modin.plugin._internal.utils import pandas_lit from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage -from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage from snowflake.snowpark.types import ( BooleanType, DataType, @@ -35,7 +34,6 @@ StringType, TimestampTimeZone, TimestampType, - VariantType, _FractionalType, ) diff --git a/tests/integ/modin/tools/test_to_datetime.py b/tests/integ/modin/tools/test_to_datetime.py index a0ac55958a9..1ea3445d15a 100644 --- a/tests/integ/modin/tools/test_to_datetime.py +++ b/tests/integ/modin/tools/test_to_datetime.py @@ -104,7 +104,7 @@ def test_to_datetime_format(self, cache, box, format, expected): ["1/3/2000", "20000103", "%m/%d/%Y"], ], ) - @sql_count_checker(query_count=1) + @sql_count_checker(query_count=0) def test_to_datetime_format_scalar(self, cache, arg, expected, format): result = to_datetime(arg, format=format, cache=cache) expected = Timestamp(expected) @@ -120,7 +120,7 @@ def test_to_datetime_format_scalar(self, cache, arg, expected, format): def test_to_datetime_format_unimplemented(self, cache, arg, format): with pytest.raises(NotImplementedError): assert to_datetime( - arg, format=format, cache=cache + pd.Index([arg]), format=format, cache=cache ) == native_pd.to_datetime(arg, format=format, cache=cache) @pytest.mark.parametrize( @@ -135,7 +135,7 @@ def test_to_datetime_format_not_match(self, cache, arg, format): SnowparkSQLException, match=f"Can't parse '{arg}' as timestamp with format 'DD/MM/YYYY'", ): - to_datetime(arg, format=format, cache=cache) + to_datetime(pd.Index([arg]), format=format, cache=cache).to_pandas() @sql_count_checker(query_count=2, udf_count=0) def test_to_datetime_format_YYYYMMDD(self, cache): @@ -302,7 +302,7 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input, expected): @sql_count_checker(query_count=2) def test_to_datetime_with_NA(self, data, format, expected): # GH#42957 - result = to_datetime(data, format=format) + result = to_datetime(pd.Index(data), format=format) assert_index_equal(result, pd.DatetimeIndex(expected)) @sql_count_checker(query_count=1, udf_count=0) @@ -328,7 +328,7 @@ def test_to_datetime_format_integer_year_month(self, cache): result = to_datetime(ser, format="%Y%m", cache=cache) assert_series_equal(result, expected, check_index_type=False) - @sql_count_checker(query_count=1) + @sql_count_checker(query_count=0) def test_to_datetime_format_microsecond(self, cache): month_abbr = calendar.month_abbr[4] val = f"01-{month_abbr}-2011 00:00:01.978" @@ -384,7 +384,9 @@ def test_to_datetime_format_microsecond(self, cache): ) @sql_count_checker(query_count=1) def test_to_datetime_format_time(self, cache, value, format, dt): - assert to_datetime(value, format=format, cache=cache) == dt + assert ( + to_datetime(pd.Index([value]), format=format, cache=cache).to_pandas() == dt + ) @sql_count_checker(query_count=0) def test_to_datetime_with_non_exact_unimplemented(self, cache): @@ -407,9 +409,9 @@ def test_to_datetime_with_non_exact_unimplemented(self, cache): "2012-01-01 09:00:00.001000000", ], ) - @sql_count_checker(query_count=2) + @sql_count_checker(query_count=1, join_count=1) def test_parse_nanoseconds_with_formula(self, cache, arg): - + arg = pd.Index([arg]) # GH8989 # truncating the nanoseconds when a format was provided expected = to_datetime(arg, cache=cache) @@ -426,7 +428,10 @@ def test_parse_nanoseconds_with_formula(self, cache, arg): @sql_count_checker(query_count=0) def test_to_datetime_format_weeks(self, value, fmt, expected, cache): with pytest.raises(NotImplementedError): - assert to_datetime(value, format=fmt, cache=cache) == expected + assert ( + to_datetime(pd.Index([value]), format=fmt, cache=cache).to_pandas()[0] + == expected + ) @pytest.mark.parametrize( "fmt,dates,expected_dates", @@ -497,7 +502,7 @@ def test_to_datetime_parse_tzname_or_tzoffset_fallback( ): # GH 13486 with pytest.raises(NotImplementedError): - to_datetime(dates, format=fmt).to_list() + to_datetime(pd.Index(dates), format=fmt).to_list() @sql_count_checker(query_count=4) def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self): @@ -535,7 +540,7 @@ def test_to_datetime_parse_timezone_malformed(self, offset): SnowparkSQLException, match="Can't parse|as timestamp with format 'YYYY-MM-DD HH24:MI:SS TZHTZM'", ): - to_datetime([date], format=fmt).to_pandas() + to_datetime(pd.Index([date]), format=fmt).to_pandas() @sql_count_checker(query_count=0) def test_to_datetime_parse_timezone_keeps_name(self): @@ -551,7 +556,7 @@ class TestToDatetime: def test_to_datetime_mixed_datetime_and_string(self): d1 = datetime(2020, 1, 1, 17, tzinfo=timezone(-timedelta(hours=1))) d2 = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) - res = to_datetime(["2020-01-01 17:00:00 -0100", d2]) + res = to_datetime(pd.Index(["2020-01-01 17:00:00 -0100", d2])) # The input will become a series with variant type and the timezone is unaware by the Snowflake engine, so the # result ignores the timezone by default expected = native_pd.DatetimeIndex( @@ -559,7 +564,7 @@ def test_to_datetime_mixed_datetime_and_string(self): ) assert_index_equal(res, expected) # Set utc=True to make sure timezone aware in to_datetime - res = to_datetime(["2020-01-01 17:00:00 -0100", d2], utc=True) + res = to_datetime(pd.Index(["2020-01-01 17:00:00 -0100", d2]), utc=True) expected = pd.DatetimeIndex([d1, d2]) assert_index_equal(res, expected) @@ -584,15 +589,15 @@ def test_to_datetime_dtarr(self, tz): @sql_count_checker(query_count=1) def test_to_datetime_pydatetime(self): - actual = to_datetime(datetime(2008, 1, 15)) + actual = to_datetime(pd.Index([datetime(2008, 1, 15)])) assert actual == np.datetime64(datetime(2008, 1, 15)) @pytest.mark.parametrize( "dt", [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] ) - @sql_count_checker(query_count=1) + @sql_count_checker(query_count=1, join_count=2) def test_to_datetime_dt64s(self, cache, dt): - assert to_datetime(dt, cache=cache) == Timestamp(dt) + assert to_datetime(pd.Index([dt]), cache=cache)[0] == Timestamp(dt) @pytest.mark.parametrize( "sample", @@ -831,11 +836,11 @@ def test_to_datetime_df_negative(self): {"arg": 1490195805433502912, "unit": "ns"}, ], ) - @sql_count_checker(query_count=1) + @sql_count_checker(query_count=1, join_count=2) def test_to_datetime_unit(self, sample): - assert pd.to_datetime( - sample["arg"], unit=sample["unit"] - ) == native_pd.to_datetime(sample["arg"], unit=sample["unit"]) + assert pd.to_datetime(pd.Index([sample["arg"]]), unit=sample["unit"])[ + 0 + ] == native_pd.to_datetime(sample["arg"], unit=sample["unit"]) @sql_count_checker(query_count=0) def test_to_datetime_unit_negative(self):