From 82eec96b1587f2639e67762c965a5faff16427c0 Mon Sep 17 00:00:00 2001
From: Varnika Budati
Date: Thu, 12 Sep 2024 10:48:38 -0700
Subject: [PATCH 1/8] temp

---
 .../modin/plugin/extensions/datetime_index.py | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
index df136af1a34..e02e02810c8 100644
--- a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
+++ b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
@@ -26,7 +26,7 @@
 
 from __future__ import annotations
 
-from datetime import tzinfo
+from datetime import timedelta, tzinfo
 
 import modin
 import numpy as np
@@ -1489,7 +1489,6 @@ def to_pydatetime(self) -> np.ndarray:
                datetime.datetime(2018, 3, 1, 0, 0)], dtype=object)
         """
 
-    @datetime_index_not_implemented()
     def mean(
         self, *, skipna: bool = True, axis: AxisInt | None = 0
     ) -> native_pd.Timestamp:
@@ -1520,11 +1519,16 @@ def mean(
         >>> idx = pd.date_range('2001-01-01 00:00', periods=3)
         >>> idx
         DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], dtype='datetime64[ns]', freq=None)
-        >>> idx.mean()  # doctest: +SKIP
+        >>> idx.mean()
         Timestamp('2001-01-02 00:00:00')
         """
+        return (
+            self.to_series()
+            .agg("mean", axis=axis, skipna=skipna)
+            .to_pandas()
+            .squeeze(axis=1)
+        )
 
-    @datetime_index_not_implemented()
     def std(
         self,
         axis=None,
@@ -1533,7 +1537,7 @@ def std(
         ddof: int = 1,
         keepdims: bool = False,
         skipna: bool = True,
-    ):
+    ) -> timedelta:
         """
         Return sample standard deviation over requested axis.
 
@@ -1568,6 +1572,16 @@ def std(
         >>> idx = pd.date_range('2001-01-01 00:00', periods=3)
         >>> idx
         DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03'], dtype='datetime64[ns]', freq=None)
-        >>> idx.std()  # doctest: +SKIP
+        >>> idx.std()
         Timedelta('1 days 00:00:00')
         """
+        kwargs = {
+            "dtype": dtype,
+            "out": out,
+            "ddof": ddof,
+            "keepdims": keepdims,
+            "skipna": skipna,
+        }
+        return (
+            self.to_series().agg("std", axis=axis, **kwargs).to_pandas().squeeze(axis=1)
+        )

From c592a0f43d696d515fd3607fa60a2e767c4b19f0 Mon Sep 17 00:00:00 2001
From: Varnika Budati
Date: Fri, 13 Sep 2024 14:49:36 -0700
Subject: [PATCH 2/8] support datetimeindex.std and mean

---
 CHANGELOG.md                                  |  1 +
 .../supported/datetime_index_supported.rst    |  4 +-
 .../modin/plugin/extensions/datetime_index.py | 51 ++++++----
 .../index/test_datetime_index_methods.py      | 94 +++++++++++++++++++
 4 files changed, 128 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e0589d4a358..c3da8d2dd0b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@
 #### New Features
 
 - Added support for `TimedeltaIndex.mean` method.
+- Added support for `DatetimeIndex.mean` and `DatetimeIndex.std` methods.
 
 ## 1.22.1 (2024-09-11)
 
diff --git a/docs/source/modin/supported/datetime_index_supported.rst b/docs/source/modin/supported/datetime_index_supported.rst
index 68b1935da96..325da109877 100644
--- a/docs/source/modin/supported/datetime_index_supported.rst
+++ b/docs/source/modin/supported/datetime_index_supported.rst
@@ -100,7 +100,7 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``day_name``                | P                               | ``locale``                       |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``mean``                    | N                               |                                  |                                                    |
+| ``mean``                    | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``std``                     | N                               |                                  |                                                    |
+| ``std``                     | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
diff --git a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
index e02e02810c8..2ad902e8a4e 100644
--- a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
+++ b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
@@ -43,6 +43,7 @@
 )
 from pandas.core.dtypes.common import is_datetime64_any_dtype
 
+from snowflake.snowpark.modin.pandas import to_datetime, to_timedelta
 from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import (
     SnowflakeQueryCompiler,
 )
@@ -1500,6 +1501,8 @@ def mean(
         skipna : bool, default True
             Whether to ignore any NaT elements.
         axis : int, optional, default 0
+            The axis to calculate the mean over.
+            This parameter is ignored - 0 is the only valid axis.
 
         Returns
         -------
@@ -1522,21 +1525,22 @@ def mean(
         >>> idx.mean()
         Timestamp('2001-01-02 00:00:00')
         """
-        return (
-            self.to_series()
-            .agg("mean", axis=axis, skipna=skipna)
-            .to_pandas()
-            .squeeze(axis=1)
+        # Need to convert timestamp to int value (nanoseconds) before aggregating.
+        # TODO: SNOW-1625233 When `tz` is supported, add a `tz` parameter to `to_datetime` for correct timezone result.
+        if axis not in [None, 0]:
+            raise ValueError(
+                f"axis={axis} is not supported, this parameter is ignored. 0 is the only valid axis."
+            )
+        return to_datetime(
+            self.to_series().astype("int64").agg("mean", axis=0, skipna=skipna)
         )
 
     def std(
         self,
-        axis=None,
-        dtype=None,
-        out=None,
+        axis: AxisInt | None = None,
         ddof: int = 1,
-        keepdims: bool = False,
         skipna: bool = True,
+        **kwargs,
     ) -> timedelta:
         """
         Return sample standard deviation over requested axis.
@@ -1546,11 +1550,12 @@ def std(
         Parameters
         ----------
         axis : int, optional
-            Axis for the function to be applied on. For :class:`pandas.Series`
-            this parameter is unused and defaults to ``None``.
+            The axis to calculate the standard deviation over.
+            This parameter is ignored - 0 is the only valid axis.
         ddof : int, default 1
            Degrees of Freedom. The divisor used in calculations is `N - ddof`,
            where `N` represents the number of elements.
+           This parameter is not yet supported.
         skipna : bool, default True
             Exclude NA/null values. If an entire row/column is ``NA``, the result
             will be ``NA``.
@@ -1575,13 +1580,19 @@ def std(
         >>> idx.std()
         Timedelta('1 days 00:00:00')
         """
-        kwargs = {
-            "dtype": dtype,
-            "out": out,
-            "ddof": ddof,
-            "keepdims": keepdims,
-            "skipna": skipna,
-        }
-        return (
-            self.to_series().agg("std", axis=axis, **kwargs).to_pandas().squeeze(axis=1)
+        if axis not in [None, 0]:
+            raise ValueError(
+                f"axis={axis} is not supported, this parameter is ignored. 0 is the only valid axis."
+            )
+        if ddof != 1:
+            raise NotImplementedError(
+                "`ddof` parameter is not yet supported for `std`."
+            )
+        # Need to convert timestamp to a float type to prevent overflow when aggregating.
+        # Cannot directly convert a timestamp to a float; therefore, first convert it to an int then a float.
+        return to_timedelta(
+            self.to_series()
+            .astype(int)
+            .astype(float)
+            .agg("std", axis=0, ddof=ddof, skipna=skipna, **kwargs)
         )
diff --git a/tests/integ/modin/index/test_datetime_index_methods.py b/tests/integ/modin/index/test_datetime_index_methods.py
index 143e1d74080..a01e740ee84 100644
--- a/tests/integ/modin/index/test_datetime_index_methods.py
+++ b/tests/integ/modin/index/test_datetime_index_methods.py
@@ -294,3 +294,97 @@ def test_floor_ceil_round_negative(func, freq, ambiguous, nonexistent):
         getattr(snow_index, func)(
             freq=freq, ambiguous=ambiguous, nonexistent=nonexistent
         )
+
+
+@pytest.mark.parametrize(
+    "native_index",
+    [
+        native_pd.date_range("2021-01-01", periods=5),
+        native_pd.date_range("2021-01-01", periods=5, freq="2D"),
+        pytest.param(
+            native_pd.DatetimeIndex(
+                [
+                    "2014-04-04 23:56:01.000000001",
+                    "2014-07-18 21:24:02.000000002",
+                    "2015-11-22 22:14:03.000000003",
+                    "2015-11-23 20:12:04.1234567890",
+                    pd.NaT,
+                ],
+                tz="US/Eastern",
+            ),
+            marks=pytest.mark.xfail(reason="Snowpark pandas does not support timezone"),
+        ),
+        native_pd.DatetimeIndex(
+            [
+                "2014-04-04 23:56",
+                pd.NaT,
+                "2014-07-18 21:24",
+                "2015-11-22 22:14",
+                pd.NaT,
+            ]
+        ),
+    ],
+)
+@pytest.mark.parametrize("skipna", [True, False])
+@sql_count_checker(query_count=1)
+def test_datetime_index_mean(native_index, skipna):
+    snow_index = pd.DatetimeIndex(native_index)
+    native_res = native_index.mean(skipna=skipna)
+    snow_res = snow_index.mean(skipna=skipna)
+    if native_res is pd.NaT:
+        assert snow_res is pd.NaT
+    else:
+        assert snow_res == native_res
+
+
+@pytest.mark.parametrize(
+    "native_index",
+    [
+        native_pd.date_range("2021-01-01", periods=5),
+        native_pd.date_range("2021-01-01", periods=5, freq="2D"),
+        # TODO: SNOW-1625233 Remove xfail when timezone is supported.
+        pytest.param(
+            native_pd.DatetimeIndex(
+                [
+                    "2014-04-04 23:56:01.000000001",
+                    "2014-07-18 21:24:02.000000002",
+                    "2015-11-22 22:14:03.000000003",
+                    "2015-11-23 20:12:04.1234567890",
+                    pd.NaT,
+                ],
+                tz="US/Eastern",
+            ),
+            marks=pytest.mark.xfail(reason="Snowpark pandas does not support timezone"),
+        ),
+        native_pd.DatetimeIndex(
+            [
+                "2014-04-04 23:56",
+                pd.NaT,
+                "2014-07-18 21:24",
+                "2015-11-22 22:14",
+                pd.NaT,
+            ]
+        ),
+    ],
+)
+@pytest.mark.parametrize("ddof", [1])
+@pytest.mark.parametrize("skipna", [True, False])
+@sql_count_checker(query_count=1)
+def test_datetime_index_std(native_index, ddof, skipna):
+    snow_index = pd.DatetimeIndex(native_index)
+    native_res = native_index.std(ddof=ddof, skipna=skipna)
+    snow_res = snow_index.std(ddof=ddof, skipna=skipna)
+    # Since the Snowpark pandas implementation converts timestamp values to float values,
+    # there is some loss in accuracy. Hence, we use approx to compare the results.
+    assert snow_res.total_seconds() == pytest.approx(native_res.total_seconds(), nan_ok=True)
+
+
+@pytest.mark.parametrize("ops", ["mean", "std"])
+@sql_count_checker(query_count=0)
+def test_datetime_index_agg_ops_axis_negative(ops):
+    snow_index = pd.DatetimeIndex(["2021-01-01", "2021-01-02", "2021-01-03"])
+    msg = (
+        "axis=1 is not supported, this parameter is ignored. 0 is the only valid axis."
+    )
+    with pytest.raises(ValueError, match=msg):
+        getattr(snow_index, ops)(axis=1)

From a2696a5c681e3c701d1d1d67d472d07a03961bca Mon Sep 17 00:00:00 2001
From: Varnika Budati
Date: Fri, 13 Sep 2024 17:55:48 -0700
Subject: [PATCH 3/8] fix changedoc, add ddof test

---
 .../supported/datetime_index_supported.rst    |  2 +-
 .../index/test_datetime_index_methods.py      | 21 ++++++++++++++-----
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/docs/source/modin/supported/datetime_index_supported.rst b/docs/source/modin/supported/datetime_index_supported.rst
index 46cee7f6014..9ebf6935f77 100644
--- a/docs/source/modin/supported/datetime_index_supported.rst
+++ b/docs/source/modin/supported/datetime_index_supported.rst
@@ -102,5 +102,5 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``mean``                    | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``std``                     | Y                               |                                  |                                                    |
+| ``std``                     | P                               | ``ddof``                         |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
diff --git a/tests/integ/modin/index/test_datetime_index_methods.py b/tests/integ/modin/index/test_datetime_index_methods.py
index 094ffd1280c..29932edc1eb 100644
--- a/tests/integ/modin/index/test_datetime_index_methods.py
+++ b/tests/integ/modin/index/test_datetime_index_methods.py
@@ -465,7 +465,9 @@ def test_datetime_index_mean(native_index, skipna):
                 ],
                 tz="US/Eastern",
             ),
-            marks=pytest.mark.xfail(reason="Snowpark pandas does not support timezone"),
+            marks=pytest.mark.xfail(
+                reason="SNOW-1664175 Snowpark pandas `to_datetime` does not support tz"
+            ),
         ),
         native_pd.DatetimeIndex(
             [
@@ -494,8 +496,17 @@ def test_datetime_index_std(native_index, ddof, skipna):
 @sql_count_checker(query_count=0)
 def test_datetime_index_agg_ops_axis_negative(ops):
     snow_index = pd.DatetimeIndex(["2021-01-01", "2021-01-02", "2021-01-03"])
-    msg = (
-        "axis=1 is not supported, this parameter is ignored. 0 is the only valid axis."
-    )
-    with pytest.raises(ValueError, match=msg):
+    with pytest.raises(
+        ValueError,
+        match="axis=1 is not supported, this parameter is ignored. 0 is the only valid axis.",
+    ):
         getattr(snow_index, ops)(axis=1)
+
+
+@sql_count_checker(query_count=0)
+def test_datetime_index_std_ddof_negative():
+    snow_index = pd.DatetimeIndex(["2021-01-01", "2021-01-02", "2021-01-03"])
+    with pytest.raises(
+        NotImplementedError, match="`ddof` parameter is not yet supported for `std`."
+    ):
+        snow_index.std(ddof=2)

From 867b98cc8d9672c1cc9f758d89f5a7a1d2421419 Mon Sep 17 00:00:00 2001
From: Varnika Budati
Date: Mon, 16 Sep 2024 10:53:37 -0700
Subject: [PATCH 4/8] Update tests/integ/modin/index/test_datetime_index_methods.py

Co-authored-by: Andong Zhan
---
 tests/integ/modin/index/test_datetime_index_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integ/modin/index/test_datetime_index_methods.py b/tests/integ/modin/index/test_datetime_index_methods.py
index 29932edc1eb..621f3ee49c1 100644
--- a/tests/integ/modin/index/test_datetime_index_methods.py
+++ b/tests/integ/modin/index/test_datetime_index_methods.py
@@ -423,7 +423,7 @@ def test_floor_ceil_round_negative(func, freq, ambiguous, nonexistent):
                 ],
                 tz="US/Eastern",
             ),
-            marks=pytest.mark.xfail(reason="Snowpark pandas does not support timezone"),
+            marks=pytest.mark.xfail(reason="TODO: SNOW-1625233 Snowpark pandas to_datetime does not support timezone"),
         ),
         native_pd.DatetimeIndex(
             [

From 58778b3e623da121072827952a0d4b39953b309e Mon Sep 17 00:00:00 2001
From: Varnika Budati
Date: Mon, 16 Sep 2024 16:29:01 -0700
Subject: [PATCH 5/8] fix lint

---
 tests/integ/modin/index/test_datetime_index_methods.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/integ/modin/index/test_datetime_index_methods.py b/tests/integ/modin/index/test_datetime_index_methods.py
index 621f3ee49c1..a7d32a45240 100644
--- a/tests/integ/modin/index/test_datetime_index_methods.py
+++ b/tests/integ/modin/index/test_datetime_index_methods.py
@@ -423,7 +423,9 @@ def test_floor_ceil_round_negative(func, freq, ambiguous, nonexistent):
                 ],
                 tz="US/Eastern",
             ),
-            marks=pytest.mark.xfail(reason="TODO: SNOW-1625233 Snowpark pandas to_datetime does not support timezone"),
+            marks=pytest.mark.xfail(
+                reason="TODO: SNOW-1625233 Snowpark pandas to_datetime does not support timezone"
+            ),
         ),
         native_pd.DatetimeIndex(
             [

From 361d3fcad8edea8c841547b0828f10ac4301ed71 Mon Sep 17 00:00:00 2001
From: Varnika Budati
Date: Wed, 18 Sep 2024 10:58:52 -0700
Subject: [PATCH 6/8] fix inaccuracy issue by converting timestamp to seconds in std

---
 .../snowpark/modin/plugin/extensions/datetime_index.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
index d8982f11c97..3cfbb62f679 100644
--- a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
+++ b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
@@ -1601,11 +1601,11 @@ def std(
             raise NotImplementedError(
                 "`ddof` parameter is not yet supported for `std`."
             )
-        # Need to convert timestamp to a float type to prevent overflow when aggregating.
+        # Need to convert timestamp to seconds to prevent overflow when aggregating.
         # Cannot directly convert a timestamp to a float; therefore, first convert it to an int then a float.
         return to_timedelta(
-            self.to_series()
-            .astype(int)
-            .astype(float)
-            .agg("std", axis=0, ddof=ddof, skipna=skipna, **kwargs)
+            (self.to_series().astype(int) / 1e9).agg(
+                "std", axis=0, ddof=ddof, skipna=skipna, **kwargs
+            )
+            * 1e9
         )

From 7c4b6658faa6dbfc7d5954bc6dc35fdde744ccb0 Mon Sep 17 00:00:00 2001
From: Varnika Budati
Date: Wed, 18 Sep 2024 12:16:20 -0700
Subject: [PATCH 7/8] add better description and use int 1e9 instead of float

---
 .../modin/plugin/extensions/datetime_index.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
index 3cfbb62f679..0c38b8c5bef 100644
--- a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
+++ b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
@@ -1601,11 +1601,15 @@ def std(
             raise NotImplementedError(
                 "`ddof` parameter is not yet supported for `std`."
             )
-        # Need to convert timestamp to seconds to prevent overflow when aggregating.
-        # Cannot directly convert a timestamp to a float; therefore, first convert it to an int then a float.
+        # Snowflake cannot directly perform `std` on a timestamp; therefore, convert the timestamp to an integer.
+        # By default, the integer version of a timestamp is in nanoseconds. Directly performing computations with
+        # nanoseconds can lead to results with integer size much larger than the original integer size. Therefore,
+        # convert the nanoseconds to seconds and then compute the standard deviation.
+        # The timestamp is converted to seconds instead of the float version of nanoseconds since that can lead to
+        # floating point precision issues
         return to_timedelta(
-            (self.to_series().astype(int) / 1e9).agg(
+            (self.to_series().astype(int) / 1_000_000_000).agg(
                 "std", axis=0, ddof=ddof, skipna=skipna, **kwargs
             )
-            * 1e9
+            * 1_000_000_000
         )

From 0b973345dd8a3cb7db6afed771c807036adf3aaa Mon Sep 17 00:00:00 2001
From: Varnika Budati
Date: Wed, 18 Sep 2024 12:19:49 -0700
Subject: [PATCH 8/8] Update src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py

Co-authored-by: Andong Zhan
---
 .../snowpark/modin/plugin/extensions/datetime_index.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
index 0c38b8c5bef..16c6ebdc1d0 100644
--- a/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
+++ b/src/snowflake/snowpark/modin/plugin/extensions/datetime_index.py
@@ -1608,7 +1608,7 @@ def std(
         # The timestamp is converted to seconds instead of the float version of nanoseconds since that can lead to
         # floating point precision issues
         return to_timedelta(
-            (self.to_series().astype(int) / 1_000_000_000).agg(
+            (self.to_series().astype(int) // 1_000_000_000).agg(
                 "std", axis=0, ddof=ddof, skipna=skipna, **kwargs
             )
             * 1_000_000_000
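
Note (illustrative sketch, not part of the patch series): the conversion trick that the final commits apply inside Snowpark pandas can be reproduced with plain pandas. Mean is computed on the int64 nanosecond view and converted back with to_datetime; std is computed on whole epoch seconds (integer division by 1_000_000_000) and scaled back to nanoseconds before to_timedelta, which avoids the float precision loss of working in raw nanoseconds. The variable names below are hypothetical.

    # Sketch of the approach in plain pandas, under the assumptions stated above.
    import pandas as pd

    idx = pd.DatetimeIndex(["2001-01-01", "2001-01-02", "2001-01-03"])

    # mean: aggregate the int64 nanosecond values, then convert back to a Timestamp.
    mean_ts = pd.to_datetime(idx.astype("int64").to_series().mean())

    # std: work in whole seconds so the aggregated numbers stay small, then scale
    # back to nanoseconds before converting to a Timedelta.
    std_td = pd.to_timedelta(
        (idx.astype("int64") // 1_000_000_000).to_series().std() * 1_000_000_000
    )

    print(mean_ts)  # 2001-01-02 00:00:00
    print(std_td)   # 1 days 00:00:00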