From 3240e5eb4e0ecc82bafdb383ca618c17d5f98b02 Mon Sep 17 00:00:00 2001 From: Mahesh Vashishtha Date: Wed, 4 Sep 2024 14:26:48 -0700 Subject: [PATCH] SNOW-1637102: Support binary operations between timedelta and number. (#2200) Fixes SNOW-1637102 Signed-off-by: sfc-gh-mvashishtha Co-authored-by: Andong Zhan --- CHANGELOG.md | 1 + .../modin/plugin/_internal/binary_op_utils.py | 177 ++++- .../modin/plugin/_internal/type_utils.py | 7 +- .../compiler/snowflake_query_compiler.py | 6 +- tests/integ/modin/binary/test_timedelta.py | 671 ++++++++++++++++-- 5 files changed, 783 insertions(+), 79 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3460c106bf4..2e3d744eeb0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,7 @@ - support indexing with Timedelta data columns. - support for adding or subtracting timestamps and `Timedelta`. - support for binary arithmetic between two `Timedelta` values. + - support for binary arithmetic and comparisons between `Timedelta` values and numeric values. - support for lazy `TimedeltaIndex`. - support for `pd.to_timedelta`. - support for `GroupBy` aggregations `min`, `max`, `mean`, `idxmax`, `idxmin`, `std`, `sum`, `median`, `count`, `any`, `all`, `size`, `nunique`. diff --git a/src/snowflake/snowpark/modin/plugin/_internal/binary_op_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/binary_op_utils.py index 6d79de24ffb..8d69752ea38 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/binary_op_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/binary_op_utils.py @@ -12,6 +12,7 @@ from snowflake.snowpark.column import Column as SnowparkColumn from snowflake.snowpark.functions import ( + ceil, col, concat, dateadd, @@ -30,7 +31,10 @@ SnowparkPandasColumn, TimedeltaType, ) -from snowflake.snowpark.modin.plugin._internal.type_utils import infer_object_type +from snowflake.snowpark.modin.plugin._internal.type_utils import ( + DataTypeGetter, + infer_object_type, +) from snowflake.snowpark.modin.plugin._internal.utils import pandas_lit from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage from snowflake.snowpark.types import ( @@ -41,6 +45,7 @@ TimestampType, _FractionalType, _IntegralType, + _NumericType, ) NAN_COLUMN = pandas_lit("nan").cast("float") @@ -265,22 +270,61 @@ def _op_is_between_two_timedeltas_or_timedelta_and_null( ) +def _is_numeric_non_timedelta_type(datatype: DataType) -> bool: + """ + Whether the datatype is numeric, but not a timedelta type. + + Args: + datatype: The datatype + + Returns: + bool: Whether the datatype is numeric, but not a timedelta type. + """ + return isinstance(datatype, _NumericType) and not isinstance( + datatype, TimedeltaType + ) + + +def _op_is_between_timedelta_and_numeric( + first_datatype: DataTypeGetter, second_datatype: DataTypeGetter +) -> bool: + """ + Whether the binary operation is between a timedelta and a numeric type. + + Returns true if either operand is a timedelta and the other operand is a + non-timedelta numeric. + + Args: + First datatype: Getter for first datatype. + Second datatype: Getter for second datatype. + + Returns: + bool: Whether the binary operation is between a timedelta and a numeric type. + """ + return ( + isinstance(first_datatype(), TimedeltaType) + and _is_numeric_non_timedelta_type(second_datatype()) + ) or ( + _is_numeric_non_timedelta_type(first_datatype()) + and isinstance(second_datatype(), TimedeltaType) + ) + + def compute_binary_op_between_snowpark_columns( op: str, first_operand: SnowparkColumn, - first_datatype: Callable[[], DataType], + first_datatype: DataTypeGetter, second_operand: SnowparkColumn, - second_datatype: Callable[[], DataType], + second_datatype: DataTypeGetter, ) -> SnowparkPandasColumn: """ Compute pandas binary operation for two SnowparkColumns Args: op: pandas operation first_operand: SnowparkColumn for lhs - first_datatype: Callable for Snowpark Datatype for lhs, this is lazy so we can avoid pulling the value if - it is not needed. + first_datatype: Callable for Snowpark Datatype for lhs second_operand: SnowparkColumn for rhs - second_datatype: Callable for Snowpark DateType for rhs, this is lazy so we can avoid pulling the value if + second_datatype: Callable for Snowpark DateType for rhs it is not needed. Returns: @@ -383,34 +427,104 @@ def compute_binary_op_between_snowpark_columns( raise np.core._exceptions._UFuncBinaryResolutionError( # type: ignore[attr-defined] np.multiply, (np.dtype("timedelta64[ns]"), np.dtype("timedelta64[ns]")) ) - elif _op_is_between_two_timedeltas_or_timedelta_and_null( + elif op in ( + "eq", + "ne", + "gt", + "ge", + "lt", + "le", + ) and _op_is_between_two_timedeltas_or_timedelta_and_null( first_datatype(), second_datatype() - ) and op in ("eq", "ne", "gt", "ge", "lt", "le", "truediv"): + ): # These operations, when done between timedeltas, work without any # extra handling in `snowpark_pandas_type` or `binary_op_result_column`. - # They produce outputs that are not timedeltas (e.g. numbers for floordiv - # and truediv, and bools for the comparisons). pass + elif op == "mul" and ( + _op_is_between_timedelta_and_numeric(first_datatype, second_datatype) + ): + binary_op_result_column = first_operand * second_operand + snowpark_pandas_type = TimedeltaType() + # For `eq` and `ne`, note that Snowflake will consider 1 equal to + # Timedelta(1) because those two have the same representation in Snowflake, + # so we have to compare types in the client. + elif op == "eq" and ( + _op_is_between_timedelta_and_numeric(first_datatype, second_datatype) + ): + binary_op_result_column = pandas_lit(False) + elif op == "ne" and _op_is_between_timedelta_and_numeric( + first_datatype, second_datatype + ): + binary_op_result_column = pandas_lit(True) elif ( - # equal_null and floordiv for timedelta also work without special - # handling, but we need to exclude them from the above case so we catch - # them in an `elif` clause further down. - op not in ("equal_null", "floordiv") - and ( - ( - isinstance(first_datatype(), TimedeltaType) - and not isinstance(second_datatype(), TimedeltaType) - ) - or ( - not isinstance(first_datatype(), TimedeltaType) - and isinstance(second_datatype(), TimedeltaType) + op in ("truediv", "floordiv") + and isinstance(first_datatype(), TimedeltaType) + and _is_numeric_non_timedelta_type(second_datatype()) + ): + binary_op_result_column = floor(first_operand / second_operand) + snowpark_pandas_type = TimedeltaType() + elif ( + op == "mod" + and isinstance(first_datatype(), TimedeltaType) + and _is_numeric_non_timedelta_type(second_datatype()) + ): + binary_op_result_column = ceil( + compute_modulo_between_snowpark_columns( + first_operand, first_datatype(), second_operand, second_datatype() ) ) + snowpark_pandas_type = TimedeltaType() + elif op in ("add", "sub") and ( + ( + isinstance(first_datatype(), TimedeltaType) + and _is_numeric_non_timedelta_type(second_datatype()) + ) + or ( + _is_numeric_non_timedelta_type(first_datatype()) + and isinstance(second_datatype(), TimedeltaType) + ) + ): + raise TypeError( + "Snowpark pandas does not support addition or subtraction between timedelta values and numeric values." + ) + elif op in ("truediv", "floordiv", "mod") and ( + _is_numeric_non_timedelta_type(first_datatype()) + and isinstance(second_datatype(), TimedeltaType) ): - # We don't support these cases yet. - # TODO(SNOW-1637102): Support this case. + raise TypeError( + "Snowpark pandas does not support dividing numeric values by timedelta values with div (/), mod (%), or floordiv (//)." + ) + elif op in ( + "add", + "sub", + "truediv", + "floordiv", + "mod", + "gt", + "ge", + "lt", + "le", + "ne", + "eq", + ) and ( + ( + isinstance(first_datatype(), TimedeltaType) + and isinstance(second_datatype(), StringType) + ) + or ( + isinstance(second_datatype(), TimedeltaType) + and isinstance(first_datatype(), StringType) + ) + ): + # TODO(SNOW-1646604): Support these cases. ErrorMessage.not_implemented( - f"Snowpark pandas does not yet support the binary operation {op} with a Timedelta column and a non-Timedelta column." + f"Snowpark pandas does not yet support the operation {op} between timedelta and string" + ) + elif op in ("gt", "ge", "lt", "le", "pow", "__or__", "__and__") and ( + _op_is_between_timedelta_and_numeric(first_datatype, second_datatype) + ): + raise TypeError( + f"Snowpark pandas does not support binary operation {op} between timedelta and a non-timedelta type." ) elif op == "floordiv": binary_op_result_column = floor(first_operand / second_operand) @@ -527,7 +641,7 @@ def are_equal_types(type1: DataType, type2: DataType) -> bool: def compute_binary_op_between_snowpark_column_and_scalar( op: str, first_operand: SnowparkColumn, - datatype: Callable[[], DataType], + datatype: DataTypeGetter, second_operand: Scalar, ) -> SnowparkPandasColumn: """ @@ -535,8 +649,7 @@ def compute_binary_op_between_snowpark_column_and_scalar( Args: op: the name of binary operation first_operand: The SnowparkColumn for lhs - datatype: Callable for Snowpark data type, this is lazy so we can avoid pulling the value if - it is not needed. + datatype: Callable for Snowpark data type second_operand: Scalar value Returns: @@ -555,7 +668,7 @@ def compute_binary_op_between_scalar_and_snowpark_column( op: str, first_operand: Scalar, second_operand: SnowparkColumn, - datatype: Callable[[], DataType], + datatype: DataTypeGetter, ) -> SnowparkPandasColumn: """ Compute the binary operation between a scalar and a Snowpark column. @@ -563,7 +676,7 @@ def compute_binary_op_between_scalar_and_snowpark_column( op: the name of binary operation first_operand: Scalar value second_operand: The SnowparkColumn for rhs - datatype: Callable for Snowpark data type, this is lazy so we can avoid pulling the value if + datatype: Callable for Snowpark data type it is not needed. Returns: @@ -581,9 +694,9 @@ def first_datatype() -> DataType: def compute_binary_op_with_fill_value( op: str, lhs: SnowparkColumn, - lhs_datatype: Callable[[], DataType], + lhs_datatype: DataTypeGetter, rhs: SnowparkColumn, - rhs_datatype: Callable[[], DataType], + rhs_datatype: DataTypeGetter, fill_value: Scalar, ) -> SnowparkPandasColumn: """ diff --git a/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py index 261903be0f7..67ddfe3abde 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py @@ -2,7 +2,7 @@ # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. # from functools import reduce -from typing import Any, Union +from typing import Any, Callable, Union import numpy as np import pandas as native_pd @@ -77,6 +77,11 @@ _NumericType, ) +# This type is for a function that returns a DataType. By using it to lazily +# get a DataType, we can sometimes defer metadata queries until we need to +# check a type. +DataTypeGetter = Callable[[], DataType] + # The order of this mapping is important because the first match in either # direction is used by TypeMapper.to_pandas() and TypeMapper.to_snowflake() NUMPY_SNOWFLAKE_TYPE_PAIRS: list[tuple[Union[type, str], DataType]] = [ diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 831bbcec8f5..9e132f4a140 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -287,6 +287,7 @@ transpose_empty_df, ) from snowflake.snowpark.modin.plugin._internal.type_utils import ( + DataTypeGetter, TypeMapper, column_astype, infer_object_type, @@ -14177,11 +14178,10 @@ def _binary_op_between_dataframe_and_series_along_axis_0( ) ) - # Lazify type map here for calling compute_binary_op_between_snowpark_columns, - # this enables the optimization to pull datatypes only on-demand if needed. + # Lazify type map here for calling compute_binary_op_between_snowpark_columns. def create_lazy_type_functions( identifiers: list[str], - ) -> list[Callable[[], DataType]]: + ) -> list[DataTypeGetter]: """ create functions that return datatype on demand for an identifier. Args: diff --git a/tests/integ/modin/binary/test_timedelta.py b/tests/integ/modin/binary/test_timedelta.py index d9fa20b1a40..a85eab8b8f9 100644 --- a/tests/integ/modin/binary/test_timedelta.py +++ b/tests/integ/modin/binary/test_timedelta.py @@ -14,6 +14,7 @@ from snowflake.snowpark.exceptions import SnowparkSQLException from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( + assert_frame_equal, assert_series_equal, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, create_test_dfs, @@ -114,8 +115,8 @@ def timestamp_scalar(request): @pytest.fixture( params=[ pd.Timedelta("10 days 23:59:59.123456789"), - datetime.timedelta(microseconds=1), - datetime.timedelta(microseconds=2), + pd.Timedelta(microseconds=1), + pd.Timedelta(microseconds=2), pd.Timedelta(nanoseconds=1), pd.Timedelta(nanoseconds=2), pd.Timedelta(nanoseconds=3), @@ -272,6 +273,95 @@ def op_between_timedeltas(request) -> list[str]: return request.param +@pytest.fixture( + params=[ + "mul", + "rmul", + "div", + "truediv", + "floordiv", + "mod", + "eq", + "ne", + ] +) +def op_between_timedelta_and_numeric(request) -> list[str]: + """Valid operations between a timedelta LHS and numeric RHS.""" + return request.param + + +@pytest.fixture( + params=[ + "mul", + "rmul", + "rdiv", + "rtruediv", + "rfloordiv", + "rmod", + "eq", + "ne", + ] +) +def op_between_numeric_and_timedelta(request) -> list[str]: + """Valid operations between a numeric RHS and timedelta LHS.""" + return request.param + + +@pytest.fixture( + params=[ + -2.5, + -2, + -1, + 1, + 0.5, + 1.5, + 2, + 1001, + ] +) +def numeric_scalar_non_null(request) -> list: + return request.param + + +@pytest.fixture +def numeric_dataframes_postive_no_nulls_1_2x2() -> tuple[ + pd.DataFrame, native_pd.DataFrame +]: + return create_test_dfs( + [ + [1, 1.5], + [ + 2, + 1001, + ], + ] + ) + + +@pytest.fixture +def numeric_list_like_postive_no_nulls_1_length_6() -> list: + return [ + 1, + 0.5, + 1.5, + 2, + 1001, + 1000, + ] + + +@pytest.fixture +def numeric_series_positive_no_nulls_1_length_6( + numeric_list_like_postive_no_nulls_1_length_6, +) -> tuple[pd.Series, native_pd.Series]: + return create_test_series(numeric_list_like_postive_no_nulls_1_length_6) + + +@pytest.fixture +def numeric_series_positive_no_nulls_2_length_2() -> tuple[pd.Series, native_pd.Series]: + return create_test_series([0.5, 2.5]) + + class TestInvalid: """ Test invalid binary operations, e.g. subtracting a timestamp from a timedelta. @@ -349,6 +439,132 @@ def test_timedelta_dataframe_multiplied_by_timedelta_scalar_invalid( ), ) + @pytest.mark.parametrize("op", ["add", "radd", "sub", "rsub"]) + @pytest.mark.parametrize("value", [1, 1.5]) + @sql_count_checker(query_count=0) + def test_timedelta_addition_and_subtraction_with_numeric( + self, timedelta_dataframes_1, op, value + ): + eval_snowpark_pandas_result( + *timedelta_dataframes_1, + lambda df: getattr(df, op)(value), + expect_exception=True, + expect_exception_match=re.escape( + "Snowpark pandas does not support addition or subtraction between timedelta values and numeric values." + ), + expect_exception_type=TypeError, + assert_exception_equal=False, + ) + + @pytest.mark.parametrize("op", ["div", "floordiv", "truediv", "mod"]) + @sql_count_checker(query_count=0) + def test_timedelta_divide_number_dataframe_by_timedelta_scalar(self, op): + eval_snowpark_pandas_result( + *create_test_dfs([1]), + lambda df: getattr(df, op)(pd.Timedelta(1)), + expect_exception=True, + expect_exception_match=re.escape( + "Snowpark pandas does not support dividing numeric values by timedelta values with div (/), mod (%), or floordiv (//)." + ), + expect_exception_type=TypeError, + assert_exception_equal=False, + ) + + @pytest.mark.parametrize("op", ["rdiv", "rfloordiv", "rtruediv", "rmod"]) + @sql_count_checker(query_count=0) + def test_divide_number_scalar_by_timedelta_dataframe(self, op): + eval_snowpark_pandas_result( + *create_test_dfs([pd.Timedelta(1)]), + lambda df: getattr(df, op)(2), + expect_exception=True, + expect_exception_match=re.escape( + "Snowpark pandas does not support dividing numeric values by timedelta values with div (/), mod (%), or floordiv (//)." + ), + expect_exception_type=TypeError, + assert_exception_equal=False, + ) + + @sql_count_checker(query_count=0) + @pytest.mark.parametrize("op", ["gt", "ge", "lt", "le"]) + def test_timedelta_less_than_or_greater_than_numeric( + self, timedelta_dataframes_1, op + ): + eval_snowpark_pandas_result( + *timedelta_dataframes_1, + lambda df: getattr(df, op)(1), + expect_exception=True, + expect_exception_type=TypeError, + expect_exception_match=re.escape( + f"Snowpark pandas does not support binary operation {op} between timedelta and a non-timedelta type" + ), + assert_exception_equal=False, + ) + + @sql_count_checker(query_count=0) + @pytest.mark.parametrize("op", ["pow", "rpow"]) + def test_pow_timedelta_numeric(self, op, timedelta_dataframes_1): + eval_snowpark_pandas_result( + *timedelta_dataframes_1, + lambda df: getattr(df, op)(1), + expect_exception=True, + expect_exception_type=TypeError, + expect_exception_match=re.escape( + "Snowpark pandas does not support binary operation pow between timedelta and a non-timedelta type" + ), + assert_exception_equal=False, + ) + + @sql_count_checker(query_count=0) + @pytest.mark.parametrize( + "op,error_message_type", + [ + ("pow", "Numeric"), + ("rpow", "Numeric"), + ("__and__", "Boolean"), + ("__rand__", "Boolean"), + ("__or__", "Boolean"), + ("__ror__", "Boolean"), + ], + ) + def test_invalid_ops_between_timedelta_dataframe_and_string_scalar( + self, op, timedelta_dataframes_1, error_message_type + ): + eval_snowpark_pandas_result( + *timedelta_dataframes_1, + lambda df: getattr(df, op)("4 days"), + expect_exception=True, + expect_exception_type=SnowparkSQLException, + expect_exception_match=f"{error_message_type} value '.*' is not recognized", + # pandas raises errors like "TypeError: ufunc 'power' not supported + # for the input types, and the inputs could not be safely coerced + # to any supported types according to the casting rule ''safe''." + # We follow the general pattern for binary operations of not trying + # to match pandas exactly. + assert_exception_equal=False, + ) + + @sql_count_checker(query_count=0) + @pytest.mark.parametrize( + "op,error_pattern", + [ + ("__or__", "__or__"), + ("__and__", "__and__"), + ("__ror__", "__or__"), + ("__rand__", "__and__"), + ], + ) + def test_bitwise_timedelta_numeric(self, op, error_pattern, timedelta_dataframes_1): + eval_snowpark_pandas_result( + *timedelta_dataframes_1, + lambda df: getattr(df, op)(1), + expect_exception=True, + expect_exception_type=TypeError, + expect_exception_match=re.escape( + f"Snowpark pandas does not support binary operation {error_pattern} between timedelta and a non-timedelta type" + ), + assert_exception_equal=False, + ) + class TestNumericEdgeCases: """ @@ -360,7 +576,7 @@ class TestNumericEdgeCases: "lhs,rhs,expected_pandas,expected_snow", [(3, -2, -1, 1), (-3, -2, -1, -1), (-3, 2, 1, -1)], ) - def test_timedelta_mod_with_negative( + def test_timedelta_mod_with_negative_timedelta( self, lhs, rhs, expected_pandas, expected_snow ): """Snowflake sometimes has different behavior for mod with negative numbers.""" @@ -374,6 +590,25 @@ def test_timedelta_mod_with_negative( native_pd.Series(pd.Timedelta(expected_snow)), ) + @sql_count_checker(query_count=1) + @pytest.mark.parametrize( + "lhs,rhs,expected_pandas,expected_snow", + [(3, -2, 1, 1), (-3, -2, -1, -1), (-3, 2, -1, -1)], + ) + def test_timedelta_mod_with_negative_numeric( + self, lhs, rhs, expected_pandas, expected_snow + ): + """Snowflake sometimes has different behavior for mod with negative numbers.""" + snow_series, pandas_series = create_test_series(pd.Timedelta(lhs)) + assert_series_equal( + pandas_series % rhs, + native_pd.Series(pd.Timedelta(expected_pandas)), + ) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + snow_series % rhs, + native_pd.Series(pd.Timedelta(expected_snow)), + ) + @sql_count_checker(query_count=0) def test_divide_timedelta_by_zero_timedelta(self): snow_series, pandas_series = create_test_series(pd.Timedelta(1)) @@ -381,6 +616,15 @@ def test_divide_timedelta_by_zero_timedelta(self): with pytest.raises(SnowparkSQLException, match=re.escape("Division by zero")): (snow_series / pd.Timedelta(0)).to_pandas() + @sql_count_checker(query_count=0) + def test_divide_timdelta_by_zero_integer(self): + snow_series, pandas_series = create_test_series(pd.Timedelta(1)) + assert_series_equal( + pandas_series / 0, native_pd.Series(pd.NaT, dtype="timedelta64[ns]") + ) + with pytest.raises(SnowparkSQLException, match=re.escape("Division by zero")): + (snow_series / 0).to_pandas() + @sql_count_checker(query_count=0) def test_floordiv_timedelta_by_zero_timedelta(self): snow_series, pandas_series = create_test_series(pd.Timedelta(1)) @@ -388,6 +632,15 @@ def test_floordiv_timedelta_by_zero_timedelta(self): with pytest.raises(SnowparkSQLException, match=re.escape("Division by zero")): (snow_series // pd.Timedelta(0)).to_pandas() + @sql_count_checker(query_count=0) + def test_floordiv_timedelta_by_zero_integer(self): + snow_series, pandas_series = create_test_series(pd.Timedelta(1)) + assert_series_equal( + pandas_series // 0, native_pd.Series(pd.NaT, dtype="timedelta64[ns]") + ) + with pytest.raises(SnowparkSQLException, match=re.escape("Division by zero")): + (snow_series // 0).to_pandas() + @sql_count_checker(query_count=1) def test_mod_timedelta_by_zero_timedelta(self): snow_series, pandas_series = create_test_series(pd.Timedelta(1)) @@ -400,6 +653,13 @@ def test_mod_timedelta_by_zero_timedelta(self): native_pd.Series(pd.NaT, dtype="timedelta64[ns]"), ) + @sql_count_checker(query_count=1) + def test_mod_timedelta_by_zero_integer(self): + eval_snowpark_pandas_result( + *create_test_series(pd.Timedelta(1)), + lambda series: series % 0, + ) + @sql_count_checker(query_count=1) @pytest.mark.parametrize("op", ["add", "radd", "sub", "rsub"]) def test_timedelta_plus_or_minus_zero_timedelta(self, timedelta_dataframes_1, op): @@ -594,18 +854,94 @@ def test_timedelta_dataframe_with_timedelta_scalar( lambda df: getattr(df, op_between_timedeltas)(timedelta_scalar_positive), ) - @pytest.mark.xfail( - strict=True, - raises=NotImplementedError, - match="does not yet support the binary operation .* with a Timedelta column and a non-Timedelta column", - reason="SNOW-1637102", - ) + @sql_count_checker(query_count=1) + def test_timedelta_dataframe_with_numeric_scalar( + self, + timedelta_dataframes_postive_no_nulls_1_2x2, + op_between_timedelta_and_numeric, + numeric_scalar_non_null, + ): + eval_snowpark_pandas_result( + *timedelta_dataframes_postive_no_nulls_1_2x2, + lambda df: getattr(df, op_between_timedelta_and_numeric)( + numeric_scalar_non_null + ), + ) + + @sql_count_checker(query_count=1) + def test_numeric_dataframe_with_timedelta_scalar( + self, + numeric_dataframes_postive_no_nulls_1_2x2, + op_between_numeric_and_timedelta, + timedelta_scalar_positive, + ): + eval_snowpark_pandas_result( + *numeric_dataframes_postive_no_nulls_1_2x2, + lambda df: getattr(df, op_between_numeric_and_timedelta)( + timedelta_scalar_positive + ), + ) + + @sql_count_checker(query_count=1) @pytest.mark.parametrize("op", ["mul", "rmul"]) - def test_timedelta_dataframe_with_integer_scalar( - self, timedelta_dataframes_postive_no_nulls_1_2x2, op + def test_datetime_timedelta_scalar_multiply_with_numeric_dataframe( + self, + numeric_dataframes_postive_no_nulls_1_2x2, + op, ): + """ + Test valid multiplication between numeric dataframe and datetime.timedelta scalar + + We test this case separately from other timedelta scalar cases due to + https://github.com/pandas-dev/pandas/issues/59656 + """ eval_snowpark_pandas_result( - *timedelta_dataframes_postive_no_nulls_1_2x2, lambda df: getattr(df, op)(3) + *numeric_dataframes_postive_no_nulls_1_2x2, + lambda df: getattr(df, op)(datetime.timedelta(microseconds=2)), + ) + + @sql_count_checker(query_count=1) + def test_datetime_timedelta_scalar_divide_by_numeric_dataframe( + self, + ): + """ + Test valid division between numeric dataframe and datetime.timedelta scalar + + We test this case separately from other timedelta scalar cases due to + https://github.com/pandas-dev/pandas/issues/59656 + """ + modin_df, pandas_df = create_test_dfs([[1.5, 1001]]) + scalar = datetime.timedelta(microseconds=2) + # Due to https://github.com/pandas-dev/pandas/issues/59656, pandas + # produces a result with microsecond precision, i.e. dtype + # timedelt64[us], so it represents pd.Timedelta(microseconds=2) / + # 1.5 as pd.Timedelta(microseconds=1) instead of as + # pd.Timedelta(nanoseconds=1333). Likewise, it represents + # pd.Timedelta(microseconds=2) / 1001 as + # pd.Timedelta(microseconds=0) instead of as + # pd.Timedelta(nanoseconds=1). + assert_frame_equal( + scalar / pandas_df, + native_pd.DataFrame( + [ + [ + pd.Timedelta(microseconds=1), + pd.Timedelta(microseconds=0), + ], + ], + dtype="timedelta64[us]", + ), + ) + assert_frame_equal( + scalar / modin_df, + native_pd.DataFrame( + [ + [ + pd.Timedelta(nanoseconds=1333), + pd.Timedelta(nanoseconds=1), + ], + ] + ), ) @@ -666,6 +1002,57 @@ def test_timedelta_series_with_timedelta_scalar( ), ) + @sql_count_checker(query_count=1) + def test_timedelta_series_with_numeric_scalar( + self, + timedelta_series_positive_no_nulls_1_length_6, + op_between_timedelta_and_numeric, + numeric_scalar_non_null, + ): + eval_snowpark_pandas_result( + *timedelta_series_positive_no_nulls_1_length_6, + lambda series: getattr(series, op_between_timedelta_and_numeric)( + numeric_scalar_non_null + ), + ) + + @sql_count_checker(query_count=1) + def test_numeric_series_with_timedelta_scalar( + self, + numeric_series_positive_no_nulls_1_length_6, + op_between_numeric_and_timedelta, + timedelta_scalar_positive, + ): + eval_snowpark_pandas_result( + *numeric_series_positive_no_nulls_1_length_6, + lambda series: getattr(series, op_between_numeric_and_timedelta)( + timedelta_scalar_positive + ), + ) + + @pytest.mark.xfail(strict=True, raises=NotImplementedError, reason="SNOW-1646604") + @pytest.mark.parametrize( + "op", + [ + "sub", + "add", + "div", + "truediv", + "eq", + "ne", + ], + ) + @sql_count_checker(query_count=1) + def test_string_series_with_timedelta_scalar(self, timedelta_scalar_positive, op): + # TODO(SNOW-1646604): Test other operand types (e.g. DataFrame of strings + scalar timedelta), + # and all operations in `op_between_timedeltas`. + # However, note that not all of those cases work in pandas due to + # https://github.com/pandas-dev/pandas/issues/59653 + eval_snowpark_pandas_result( + *create_test_series(["1 days", "1 day 1 hour"]), + lambda series: getattr(series, op)(timedelta_scalar_positive), + ) + class TestDataFrameAndListLikeAxis1: @pytest.mark.parametrize("op", ["sub", "rsub"]) @@ -729,6 +1116,38 @@ def test_timedelta_dataframe_with_timedelta_list_like( ), ) + @sql_count_checker(query_count=1) + def test_timedelta_dataframe_with_numeric_list_like( + self, + op_between_timedelta_and_numeric, + timedelta_dataframes_postive_no_nulls_1_2x2, + ): + eval_snowpark_pandas_result( + *timedelta_dataframes_postive_no_nulls_1_2x2, + lambda df: getattr(df, op_between_timedelta_and_numeric)( + [ + 1.5, + 2, + ] + ), + ) + + @sql_count_checker(query_count=1) + def test_numeric_dataframe_with_timedelta_list_like( + self, + op_between_numeric_and_timedelta, + numeric_dataframes_postive_no_nulls_1_2x2, + ): + eval_snowpark_pandas_result( + *numeric_dataframes_postive_no_nulls_1_2x2, + lambda df: getattr(df, op_between_numeric_and_timedelta)( + [ + pd.Timedelta(days=2, milliseconds=140), + pd.Timedelta(days=1, milliseconds=770), + ] + ), + ) + class TestSeriesAndListLike: @sql_count_checker(query_count=1, join_count=1) @@ -798,6 +1217,40 @@ def test_timedelta_series_and_timedelta_list_like( ), ) + @sql_count_checker(query_count=1, join_count=1) + def test_timedelta_series_and_numeric_list_like( + self, + op_between_timedelta_and_numeric, + timedelta_series_positive_no_nulls_1_length_6, + numeric_list_like_postive_no_nulls_1_length_6, + ): + eval_snowpark_pandas_result( + *timedelta_series_positive_no_nulls_1_length_6, + lambda series: getattr(series, op_between_timedelta_and_numeric)( + numeric_list_like_postive_no_nulls_1_length_6 + ), + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_numeric_series_and_timedelta_list_like( + self, + op_between_numeric_and_timedelta, + numeric_series_positive_no_nulls_1_length_6, + ): + eval_snowpark_pandas_result( + *numeric_series_positive_no_nulls_1_length_6, + lambda series: getattr(series, op_between_numeric_and_timedelta)( + [ + pd.Timedelta(days=1), + pd.Timedelta(days=2), + pd.Timedelta(days=3), + pd.Timedelta(days=4), + pd.Timedelta(days=5), + pd.Timedelta(days=6), + ] + ), + ) + class TestDataFrameAndListLikeAxis0: @sql_count_checker(query_count=1, join_count=1) @@ -865,6 +1318,34 @@ def test_timedelta_dataframe_and_timedelta_list_like( ), ) + @sql_count_checker(query_count=1, join_count=1) + def test_timedelta_dataframe_and_numeric_list_like( + self, + timedelta_dataframes_postive_no_nulls_1_2x2, + op_between_timedelta_and_numeric, + ): + eval_snowpark_pandas_result( + *timedelta_dataframes_postive_no_nulls_1_2x2, + lambda df: getattr(df, op_between_timedelta_and_numeric)([1.5, 2], axis=0), + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_numeric_dataframe_and_timedelta_list_like( + self, + numeric_dataframes_postive_no_nulls_1_2x2, + op_between_numeric_and_timedelta, + ): + eval_snowpark_pandas_result( + *numeric_dataframes_postive_no_nulls_1_2x2, + lambda df: getattr(df, op_between_numeric_and_timedelta)( + [ + pd.Timedelta(nanoseconds=1), + pd.Timedelta(microseconds=2.5), + ], + axis=0, + ), + ) + class TestSeriesAndSeries: @pytest.mark.parametrize( @@ -974,6 +1455,40 @@ def test_timedelta_and_timedleta( lambda inputs: getattr(inputs[0], op_between_timedeltas)(inputs[1]), ) + @sql_count_checker(query_count=1, join_count=1) + def test_timedelta_and_numeric( + self, + op_between_timedelta_and_numeric, + timedelta_series_positive_no_nulls_1_length_6, + numeric_series_positive_no_nulls_1_length_6, + ): + snow_lhs, pandas_lhs = timedelta_series_positive_no_nulls_1_length_6 + snow_rhs, pandas_rhs = numeric_series_positive_no_nulls_1_length_6 + eval_snowpark_pandas_result( + (snow_lhs, snow_rhs), + (pandas_lhs, pandas_rhs), + lambda inputs: getattr(inputs[0], op_between_timedelta_and_numeric)( + inputs[1] + ), + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_numeric_and_timedelta( + self, + op_between_numeric_and_timedelta, + numeric_series_positive_no_nulls_1_length_6, + timedelta_series_positive_no_nulls_1_length_6, + ): + snow_lhs, pandas_lhs = numeric_series_positive_no_nulls_1_length_6 + snow_rhs, pandas_rhs = timedelta_series_positive_no_nulls_1_length_6 + eval_snowpark_pandas_result( + (snow_lhs, snow_rhs), + (pandas_lhs, pandas_rhs), + lambda inputs: getattr(inputs[0], op_between_numeric_and_timedelta)( + inputs[1] + ), + ) + class TestDataFrameAndSeriesAxis0: @pytest.mark.parametrize("op", ["sub", "rsub"]) @@ -1054,13 +1569,43 @@ def test_timedelta_dataframe_and_timedelta_series( lambda t: getattr(t[0], op_between_timedeltas)(t[1], axis=0), ) + @sql_count_checker(query_count=1, join_count=1) + def test_timedelta_dataframe_and_numeric_series( + self, + op_between_timedelta_and_numeric, + timedelta_dataframes_postive_no_nulls_1_2x2, + numeric_series_positive_no_nulls_2_length_2, + ): + snow_df, pandas_df = timedelta_dataframes_postive_no_nulls_1_2x2 + snow_series, pandas_series = numeric_series_positive_no_nulls_2_length_2 + eval_snowpark_pandas_result( + (snow_df, snow_series), + (pandas_df, pandas_series), + lambda t: getattr(t[0], op_between_timedelta_and_numeric)(t[1], axis=0), + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_numeric_dataframe_and_timedelta_series( + self, + op_between_numeric_and_timedelta, + numeric_dataframes_postive_no_nulls_1_2x2, + timedelta_series_no_nulls_3_length_2, + ): + snow_df, pandas_df = numeric_dataframes_postive_no_nulls_1_2x2 + snow_series, pandas_series = timedelta_series_no_nulls_3_length_2 + eval_snowpark_pandas_result( + (snow_df, snow_series), + (pandas_df, pandas_series), + lambda t: getattr(t[0], op_between_numeric_and_timedelta)(t[1], axis=0), + ) + class TestDataFrameAndSeriesAxis1: - @sql_count_checker( - # One query to materialize the series for the subtraction, and another - # query to materialize the result. - query_count=2 - ) + # One query to materialize the series so we can align it along axis 1, and + # another query to materialize the result of the binary operation. + QUERY_COUNT = 2 + + @sql_count_checker(query_count=QUERY_COUNT) def test_timestamp_dataframe_minus_timestamp_series( self, timestamp_no_timezone_dataframes_3 ): @@ -1098,11 +1643,7 @@ def test_timestamp_dataframe_minus_timestamp_series( ), ) - @sql_count_checker( - # One query to materialize the series for the subtraction, and another - # query to materialize the result. - query_count=2 - ) + @sql_count_checker(query_count=QUERY_COUNT) def test_timestamp_series_minus_timestamp_dataframe( self, timestamp_no_timezone_dataframes_3 ): @@ -1143,11 +1684,7 @@ def test_timestamp_series_minus_timestamp_dataframe( ), ) - @sql_count_checker( - # One query to materialize the series for the operation, and another - # query to materialize the result. - query_count=2 - ) + @sql_count_checker(query_count=QUERY_COUNT) @pytest.mark.parametrize( "op,error_message", [ @@ -1198,11 +1735,7 @@ def test_timestamp_dataframe_plus_timedelta_series( ), ) - @sql_count_checker( - # One query to materialize the series for the subtraction, and another - # query to materialize the result. - query_count=2 - ) + @sql_count_checker(query_count=QUERY_COUNT) def test_timestamp_dataframe_minus_timedelta_series( self, timestamp_no_timezone_dataframes_3 ): @@ -1241,11 +1774,7 @@ def test_timestamp_dataframe_minus_timedelta_series( ), ) - @sql_count_checker( - # One query to materialize the series for the operation, and another - # query to materialize the result. - query_count=2 - ) + @sql_count_checker(query_count=QUERY_COUNT) @pytest.mark.parametrize( "op,error_message", [ @@ -1286,11 +1815,7 @@ def test_timedelta_dataframe_plus_timestamp_series( ), ) - @sql_count_checker( - # One query to materialize the series for the subtraction, and another - # query to materialize the result. - query_count=2 - ) + @sql_count_checker(query_count=QUERY_COUNT) def test_timedelta_dataframe_with_timedelta_series( self, timedelta_dataframes_postive_no_nulls_1_2x2, @@ -1305,6 +1830,36 @@ def test_timedelta_dataframe_with_timedelta_series( lambda t: getattr(t[0], op_between_timedeltas)(t[1]), ) + @sql_count_checker(query_count=QUERY_COUNT) + def test_timedelta_dataframe_and_numeric_series( + self, + op_between_timedelta_and_numeric, + timedelta_dataframes_postive_no_nulls_1_2x2, + numeric_series_positive_no_nulls_2_length_2, + ): + snow_df, pandas_df = timedelta_dataframes_postive_no_nulls_1_2x2 + snow_series, pandas_series = numeric_series_positive_no_nulls_2_length_2 + eval_snowpark_pandas_result( + (snow_df, snow_series), + (pandas_df, pandas_series), + lambda t: getattr(t[0], op_between_timedelta_and_numeric)(t[1]), + ) + + @sql_count_checker(query_count=QUERY_COUNT) + def test_numeric_dataframe_and_timedelta_series( + self, + op_between_numeric_and_timedelta, + numeric_dataframes_postive_no_nulls_1_2x2, + timedelta_series_no_nulls_3_length_2, + ): + snow_df, pandas_df = numeric_dataframes_postive_no_nulls_1_2x2 + snow_series, pandas_series = timedelta_series_no_nulls_3_length_2 + eval_snowpark_pandas_result( + (snow_df, snow_series), + (pandas_df, pandas_series), + lambda t: getattr(t[0], op_between_numeric_and_timedelta)(t[1]), + ) + class TestDataFrameAndDataFrameAxis1: @pytest.mark.parametrize( @@ -1387,7 +1942,7 @@ def test_timedelta_dataframe_plus_timestamp_dataframe(self, op): ) @sql_count_checker(query_count=1, join_count=1) - def test_two_timedelta_datfaframes( + def test_two_timedelta_dataframes( self, timedelta_dataframes_postive_no_nulls_1_2x2, op_between_timedeltas ): snow_lhs, pandas_lhs = timedelta_dataframes_postive_no_nulls_1_2x2 @@ -1405,3 +1960,33 @@ def test_two_timedelta_datfaframes( (pandas_lhs, pandas_rhs), lambda t: getattr(t[0], op_between_timedeltas)(t[1]), ) + + @sql_count_checker(query_count=1, join_count=1) + def test_timedelta_and_numeric( + self, + timedelta_dataframes_postive_no_nulls_1_2x2, + numeric_dataframes_postive_no_nulls_1_2x2, + op_between_timedelta_and_numeric, + ): + snow_lhs, pandas_lhs = timedelta_dataframes_postive_no_nulls_1_2x2 + snow_rhs, pandas_rhs = numeric_dataframes_postive_no_nulls_1_2x2 + eval_snowpark_pandas_result( + (snow_lhs, snow_rhs), + (pandas_lhs, pandas_rhs), + lambda t: getattr(t[0], op_between_timedelta_and_numeric)(t[1]), + ) + + @sql_count_checker(query_count=1, join_count=1) + def test_numeric_and_timedelta( + self, + numeric_dataframes_postive_no_nulls_1_2x2, + timedelta_dataframes_postive_no_nulls_1_2x2, + op_between_numeric_and_timedelta, + ): + snow_lhs, pandas_lhs = numeric_dataframes_postive_no_nulls_1_2x2 + snow_rhs, pandas_rhs = timedelta_dataframes_postive_no_nulls_1_2x2 + eval_snowpark_pandas_result( + (snow_lhs, snow_rhs), + (pandas_lhs, pandas_rhs), + lambda t: getattr(t[0], op_between_numeric_and_timedelta)(t[1]), + )