From 47cc0d0572ad5730c021f608e76b179be105d2f5 Mon Sep 17 00:00:00 2001
From: Jamison Rose
Date: Fri, 31 May 2024 14:35:44 -0700
Subject: [PATCH 01/12] SNOW-1437407: [Local Testing] Fix convert_timezone
 argument order (#1685)

---
 CHANGELOG.md                              |  6 +++
 src/snowflake/snowpark/mock/_functions.py | 57 ++++++++++++++---------
 tests/integ/scala/test_function_suite.py  | 46 ++++++++++++++++++
 3 files changed, 86 insertions(+), 23 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 992144f98b6..0c39480739f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,12 @@

 #### Improvements

+### Snowpark Local Testing Updates
+
+#### Bug Fixes
+
+- Fixed a bug in `convert_timezone` that made setting the `source_timezone` parameter return an error.
+
 ### Snowpark pandas API Updates

 #### New Features
diff --git a/src/snowflake/snowpark/mock/_functions.py b/src/snowflake/snowpark/mock/_functions.py
index 1d3ad0d1093..be496fddcf0 100644
--- a/src/snowflake/snowpark/mock/_functions.py
+++ b/src/snowflake/snowpark/mock/_functions.py
@@ -1700,9 +1700,7 @@ def mock_initcap(values: ColumnEmulator, delimiters: ColumnEmulator):

 @patch("convert_timezone")
 def mock_convert_timezone(
-    target_timezone: ColumnEmulator,
-    source_time: ColumnEmulator,
-    source_timezone: Optional[ColumnEmulator] = None,
+    *args: ColumnEmulator,
 ) -> ColumnEmulator:
     """Converts the given source_time to the target timezone.

@@ -1710,32 +1708,45 @@ def mock_convert_timezone(
     """
     import dateutil

-    is_ntz = source_time.sf_type.datatype.tz is TimestampTimeZone.NTZ
-    if source_timezone is not None and not is_ntz:
-        SnowparkLocalTestingException.raise_from_error(
-            ValueError(
+    # mock_convert_timezone matches the SQL function call semantics.
+    # It has different parameters when called with 2 or 3 args.
+    # When called with two args, the third will be replaced with None.
+    if args[2] is None:
+        target_timezone, source_time, _ = args
+        source_timezone = pandas.Series([None] * len(source_time))
+        return_type = TimestampTimeZone.TZ
+    else:
+        source_timezone, target_timezone, source_time = args
+        return_type = TimestampTimeZone.NTZ
+        if source_time.sf_type.datatype.tz is not TimestampTimeZone.NTZ:
+            raise ValueError(
                 "[Local Testing] convert_timezone can only convert NTZ timestamps when source_timezone is specified."
             )
-        )

-    # Using dateutil because it uses iana timezones while pytz would use Olson tzdb.
-    from_tz = None if source_timezone is None else dateutil.tz.gettz(source_timezone)
+    combined = pandas.concat(
+        [source_timezone, target_timezone, source_time], axis=1, ignore_index=True
+    )

-    if from_tz is not None:
-        timestamps = [ts.replace(tzinfo=from_tz) for ts in source_time]
-        return_type = TimestampTimeZone.NTZ
-    else:
-        timestamps = list(source_time)
-        return_type = TimestampTimeZone.TZ
+    def _convert(row):
+        source_timezone, target_timezone, source_time = row
+        if source_time is None:
+            return None

-    res = []
-    for tz, ts in zip(target_timezone, timestamps):
-        # Add local tz if info is missing
-        if ts.tzinfo is None:
-            ts = LocalTimezone.replace_tz(ts)
+        if source_timezone is not None:
+            # Using dateutil because it uses IANA timezones while pytz would use the Olson tzdb.
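+            # Note that dateutil.tz.gettz returns None for an unrecognized zone
+            # name, in which case replace(tzinfo=None) yields a naive timestamp.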
+ source_time = source_time.replace(tzinfo=dateutil.tz.gettz(source_timezone)) + + if source_time.tzinfo is None: + source_time = LocalTimezone.replace_tz(source_time) + + result = source_time.astimezone(dateutil.tz.gettz(target_timezone)) + + if return_type == TimestampTimeZone.NTZ: + result = result.replace(tzinfo=None) + + return result - # Convert all timestamps to the target tz - res.append(ts.astimezone(dateutil.tz.gettz(tz))) + res = combined.apply(_convert, axis=1) return ColumnEmulator( res, diff --git a/tests/integ/scala/test_function_suite.py b/tests/integ/scala/test_function_suite.py index a2b336c2d4b..d09280ea8db 100644 --- a/tests/integ/scala/test_function_suite.py +++ b/tests/integ/scala/test_function_suite.py @@ -3835,6 +3835,52 @@ def test_convert_timezone(session, local_testing_mode): LocalTimezone.set_local_timezone() +def test_convert_timezone_neg(session): + df = TestData.datetime_primitives1(session) + with pytest.raises(SnowparkSQLException): + df.select( + convert_timezone(lit("UTC"), "timestamp_tz", lit("US/Eastern")) + ).collect() + + +def test_convert_timezone_nulls(session): + null_df = session.create_dataframe([[None]]).to_df("timestamp") + Utils.check_answer( + null_df.select(convert_timezone(lit("UTC"), to_timestamp_ntz("timestamp"))), + [Row(None)], + ) + Utils.check_answer( + null_df.select( + convert_timezone( + lit("UTC"), to_timestamp_ntz("timestamp"), lit("US/Eastern") + ) + ), + [Row(None)], + ) + + +def test_convert_timezone_with_source(session, local_testing_mode): + with parameter_override( + session, + "timezone", + "America/Los_Angeles", + not IS_IN_STORED_PROC and not local_testing_mode, + ): + LocalTimezone.set_local_timezone(pytz.timezone("Etc/GMT+8")) + + df = TestData.datetime_primitives2(session) + + Utils.check_answer( + df.select(convert_timezone(lit("UTC"), "timestamp", lit("US/Eastern"))), + [ + Row(datetime(9999, 12, 31, 5, 0, 0, 123456)), + Row(datetime(1583, 1, 2, 4, 56, 1, 567890)), + ], + ) + + LocalTimezone.set_local_timezone() + + @pytest.mark.skipif( "config.getoption('local_testing_mode', default=False)", reason="time_from_parts is not yet supported in local testing mode.", From 9b32aa7520b6b367ba10ce73bc3c38401f0c8fd4 Mon Sep 17 00:00:00 2001 From: Sophie Tan Date: Fri, 31 May 2024 14:59:42 -0700 Subject: [PATCH 02/12] Add auto PR labeler for local testing label (#1718) --- .github/labeler.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/labeler.yml b/.github/labeler.yml index 8b981ef337c..4c36c741fb1 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -3,3 +3,9 @@ snowpark-pandas: - any-glob-to-any-file: - src/snowflake/snowpark/modin/** - tests/integ/modin/** + +local-testing: + - changed-files: + - any-glob-to-any-file: + - src/snowflake/snowpark/mock/** + - tests/mock/** From c0e478cf5fb1294e033c5f1893439cc5f76d96cd Mon Sep 17 00:00:00 2001 From: Jamison Rose Date: Fri, 31 May 2024 16:16:15 -0700 Subject: [PATCH 03/12] SNOW-1370054: [Local Testing] Add support for strict udfs/sprocs (#1689) --- CHANGELOG.md | 4 +++ src/snowflake/snowpark/mock/_plan.py | 14 +++++++--- .../snowpark/mock/_stored_procedure.py | 5 ++++ src/snowflake/snowpark/mock/_udf.py | 26 ++++++++++++------- tests/integ/test_stored_procedure.py | 4 --- tests/integ/test_udf.py | 4 --- 6 files changed, 35 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c39480739f..0f12b36afbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ ### Snowpark Local Testing Updates +#### New Features + +- Added 
support for the `strict` parameter when registering UDFs and Stored Procedures.
+
 #### Bug Fixes

 - Fixed a bug in `convert_timezone` that made setting the `source_timezone` parameter return an error.

 ### Snowpark pandas API Updates

 #### New Features
diff --git a/src/snowflake/snowpark/mock/_plan.py b/src/snowflake/snowpark/mock/_plan.py
index 03d01e17929..9f4f379d397 100644
--- a/src/snowflake/snowpark/mock/_plan.py
+++ b/src/snowflake/snowpark/mock/_plan.py
@@ -605,11 +605,17 @@ def cleanup_imports():
     # And these code would look like:
     # res=input.apply(...)
     # res.set_sf_type(ColumnType(exp.datatype, exp.nullable))  # fixes the drift and removes NaT
+
+    data = []
+    for _, row in function_input.iterrows():
+        if udf.strict and any([v is None for v in row]):
+            result = None
+        else:
+            result = remove_null_wrapper(udf_handler(*row))
+        data.append(result)
+
     res = ColumnEmulator(
-        data=[
-            remove_null_wrapper(udf_handler(*row))
-            for _, row in function_input.iterrows()
-        ],
+        data=data,
         sf_type=ColumnType(exp.datatype, exp.nullable),
         name=quote_name(
             f"{exp.udf_name}({', '.join(input_data.columns)})".upper()
diff --git a/src/snowflake/snowpark/mock/_stored_procedure.py b/src/snowflake/snowpark/mock/_stored_procedure.py
index 5542636181a..c0e591f0930 100644
--- a/src/snowflake/snowpark/mock/_stored_procedure.py
+++ b/src/snowflake/snowpark/mock/_stored_procedure.py
@@ -48,8 +48,10 @@ def __init__(
         imports: Set[str],
         execute_as: typing.Literal["caller", "owner"] = "owner",
         anonymous_sp_sql: Optional[str] = None,
+        strict=False,
     ) -> None:
         self.imports = imports
+        self.strict = strict
         super().__init__(
             func,
             return_type,
@@ -66,6 +68,8 @@ def __call__(
         statement_params: Optional[Dict[str, str]] = None,
     ) -> Any:
         args, session = self._validate_call(args, session)
+        if self.strict and any([arg is None for arg in args]):
+            return None

         # Unpack columns if passed
         parsed_args = []
@@ -344,6 +348,7 @@ def _do_register_sp(
             sproc_name,
             sproc_imports,
             execute_as=execute_as,
+            strict=strict,
         )

         self._registry[sproc_name] = sproc
diff --git a/src/snowflake/snowpark/mock/_udf.py b/src/snowflake/snowpark/mock/_udf.py
index 62b14e8277c..71f225a04ae 100644
--- a/src/snowflake/snowpark/mock/_udf.py
+++ b/src/snowflake/snowpark/mock/_udf.py
@@ -16,6 +16,12 @@
 from snowflake.snowpark.udf import UDFRegistration, UserDefinedFunction


+class MockUserDefinedFunction(UserDefinedFunction):
+    def __init__(self, *args, strict=False, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.strict = strict
+
+
 class MockUDFRegistration(UDFRegistration):
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
@@ -121,13 +127,7 @@ def _do_register_udf(
             raise ValueError("options replace and if_not_exists are incompatible")

         if udf_name in self._registry and if_not_exists:
-            return UserDefinedFunction(
-                self._registry[udf_name],
-                return_type,
-                input_types,
-                udf_name,
-                packages=packages,
-            )
+            return self._registry[udf_name]

         if udf_name in self._registry and not replace:
             raise SnowparkSQLException(
@@ -151,17 +151,23 @@ def _do_register_udf(
         if type(func) is tuple:  # register from file
             module_name = self._import_file(func[0], udf_name=udf_name)
-            self._registry[udf_name] = UserDefinedFunction(
+            self._registry[udf_name] = MockUserDefinedFunction(
                 (module_name, func[1]),
                 return_type,
                 input_types,
                 udf_name,
+                strict=strict,
                 packages=packages,
             )
         else:  # register from callable
-            self._registry[udf_name] = UserDefinedFunction(
-                func, return_type, input_types, udf_name, packages=packages
+            self._registry[udf_name] = MockUserDefinedFunction(
+                func,
+                
return_type, + input_types, + udf_name, + strict=strict, + packages=packages, ) return self._registry[udf_name] diff --git a/tests/integ/test_stored_procedure.py b/tests/integ/test_stored_procedure.py index 254c00b8bb9..b2027a38da7 100644 --- a/tests/integ/test_stored_procedure.py +++ b/tests/integ/test_stored_procedure.py @@ -1690,10 +1690,6 @@ def plus1(_: Session, x: int) -> int: assert "Two sessions specified in arguments" in str(ex_info) -@pytest.mark.skipif( - "config.getoption('local_testing_mode', default=False)", - reason="SNOW-1370044: support strict option for stored procedures in Local Testing", -) def test_strict_stored_procedure(session): @sproc(strict=True) def echo(_: Session, num: int) -> int: diff --git a/tests/integ/test_udf.py b/tests/integ/test_udf.py index 8a9c075778c..be9cc6cd842 100644 --- a/tests/integ/test_udf.py +++ b/tests/integ/test_udf.py @@ -2265,10 +2265,6 @@ def test_deprecate_call_udf_with_list(session, caplog): ) -@pytest.mark.skipif( - "config.getoption('local_testing_mode', default=False)", - reason="SNOW-1370035: support strict UDF in Local Testing", -) def test_strict_udf(session): @udf(strict=True) def echo(num: int) -> int: From 0d30dd813a3655b4c1284385b11c1612111c30bb Mon Sep 17 00:00:00 2001 From: Samedh Desai Date: Fri, 31 May 2024 17:33:06 -0700 Subject: [PATCH 04/12] [SNOW-1359039] Snowpark pandas: create index class that contains native pandas index for now (#1499) Co-authored-by: Adam Ling --- CHANGELOG.md | 1 + .../snowpark/modin/pandas/__init__.py | 4 +- src/snowflake/snowpark/modin/pandas/base.py | 2 +- .../snowpark/modin/pandas/dataframe.py | 22 +- .../snowpark/modin/pandas/general.py | 2 +- .../snowpark/modin/pandas/groupby.py | 2 +- .../snowpark/modin/pandas/indexing.py | 2 +- src/snowflake/snowpark/modin/pandas/series.py | 12 +- src/snowflake/snowpark/modin/pandas/utils.py | 106 +- .../modin/plugin/_internal/apply_utils.py | 3 + .../snowpark/modin/plugin/_internal/frame.py | 13 +- .../snowpark/modin/plugin/_internal/index.py | 1123 +++++++++++++++++ .../modin/plugin/_internal/indexing_utils.py | 12 +- .../modin/plugin/_internal/io_utils.py | 2 +- .../modin/plugin/_internal/transpose_utils.py | 3 +- .../compiler/snowflake_query_compiler.py | 41 +- .../snowpark/modin/plugin/docstrings/base.py | 3 +- .../modin/plugin/docstrings/dataframe.py | 3 +- .../modin/plugin/docstrings/series.py | 3 +- .../modin/plugin/extensions/pd_overrides.py | 4 + tests/integ/modin/binary/test_binary_op.py | 5 +- tests/integ/modin/conftest.py | 80 +- tests/integ/modin/frame/test_axis.py | 26 +- tests/integ/modin/frame/test_getattr.py | 2 +- tests/integ/modin/frame/test_getitem.py | 39 +- tests/integ/modin/frame/test_iloc.py | 37 +- tests/integ/modin/frame/test_insert.py | 8 +- tests/integ/modin/frame/test_loc.py | 120 +- tests/integ/modin/frame/test_mask.py | 6 +- tests/integ/modin/frame/test_name.py | 4 +- tests/integ/modin/frame/test_rename.py | 19 +- tests/integ/modin/frame/test_sample.py | 2 +- tests/integ/modin/frame/test_set_index.py | 33 +- tests/integ/modin/frame/test_setitem.py | 8 +- tests/integ/modin/frame/test_sort_values.py | 2 +- tests/integ/modin/frame/test_where.py | 11 +- .../integ/modin/groupby/test_groupby_apply.py | 10 +- .../modin/groupby/test_groupby_basic_agg.py | 2 +- .../modin/groupby/test_groupby_property.py | 30 +- .../modin/groupby/test_groupby_series.py | 2 +- tests/integ/modin/io/test_read_snowflake.py | 2 +- .../io/test_read_snowflake_select_query.py | 2 +- tests/integ/modin/pivot/test_pivot_dropna.py | 2 +- 
.../modin/pivot/test_pivot_fill_value.py | 2 +- tests/integ/modin/series/test_getattr.py | 3 +- tests/integ/modin/series/test_getitem.py | 12 +- tests/integ/modin/series/test_iloc.py | 33 +- tests/integ/modin/series/test_isin.py | 9 +- tests/integ/modin/series/test_loc.py | 37 +- tests/integ/modin/series/test_mask.py | 14 +- tests/integ/modin/series/test_rename.py | 10 +- tests/integ/modin/series/test_sample.py | 2 +- tests/integ/modin/series/test_setitem.py | 11 +- tests/integ/modin/series/test_where.py | 13 +- tests/integ/modin/test_concat.py | 2 +- .../integ/modin/test_from_pandas_to_pandas.py | 10 +- tests/integ/modin/test_utils.py | 107 +- tests/integ/modin/utils.py | 93 +- tests/unit/modin/test_class.py | 3 +- tests/unit/modin/test_internal_frame.py | 3 +- tests/unit/modin/test_type_annotations.py | 3 +- 61 files changed, 1895 insertions(+), 287 deletions(-) create mode 100644 src/snowflake/snowpark/modin/plugin/_internal/index.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f12b36afbc..ac93af15836 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ - Added support for named aggregations in `DataFrame.aggregate` and `Series.aggregate` with `axis=0`. - `pd.read_csv` reads using the native pandas CSV parser, then uploads data to snowflake using parquet. This enables most of the parameters supported by `read_csv` including date parsing and numeric conversions. Uploading via parquet is roughly twice as fast as uploading via CSV. +- Initial work to support an Index directly in Snowpark pandas. Currently, this class is a simple wrapper for a pandas index. Support for Index as a first-class component of Snowpark pandas is coming soon. ## 1.18.0 (2024-05-28) diff --git a/src/snowflake/snowpark/modin/pandas/__init__.py b/src/snowflake/snowpark/modin/pandas/__init__.py index 5ab6d7a508d..4f0d2162586 100644 --- a/src/snowflake/snowpark/modin/pandas/__init__.py +++ b/src/snowflake/snowpark/modin/pandas/__init__.py @@ -46,7 +46,6 @@ Float32Dtype, Float64Dtype, Grouper, - Index, IndexSlice, Int8Dtype, Int16Dtype, @@ -154,6 +153,9 @@ import snowflake.snowpark.modin.plugin.extensions.pd_extensions as pd_extensions # isort: skip # noqa: E402,F401 import snowflake.snowpark.modin.plugin.extensions.pd_overrides # isort: skip # noqa: E402,F401 +from snowflake.snowpark.modin.plugin.extensions.pd_overrides import ( # isort: skip # noqa: E402,F401 + Index, +) import snowflake.snowpark.modin.plugin.extensions.dataframe_extensions # isort: skip # noqa: E402,F401 import snowflake.snowpark.modin.plugin.extensions.dataframe_overrides # isort: skip # noqa: E402,F401 import snowflake.snowpark.modin.plugin.extensions.series_extensions # isort: skip # noqa: E402,F401 diff --git a/src/snowflake/snowpark/modin/pandas/base.py b/src/snowflake/snowpark/modin/pandas/base.py index b1673f7659f..36601e60504 100644 --- a/src/snowflake/snowpark/modin/pandas/base.py +++ b/src/snowflake/snowpark/modin/pandas/base.py @@ -66,7 +66,6 @@ pandas_dtype, ) from pandas.core.dtypes.inference import is_integer -from pandas.core.indexes.api import ensure_index from pandas.errors import SpecificationError from pandas.util._validators import ( validate_ascending, @@ -77,6 +76,7 @@ from snowflake.snowpark.modin import pandas as pd from snowflake.snowpark.modin.pandas.utils import ( _doc_binary_op, + ensure_index, extract_validate_and_try_convert_named_aggs_from_kwargs, get_as_shape_compatible_dataframe_or_series, is_scalar, diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py 
b/src/snowflake/snowpark/modin/pandas/dataframe.py index 60d424c1981..7c7e3359f2f 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -154,6 +154,8 @@ def __init__( # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. + from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native + self._siblings = [] # Engine.subscribe(_update_engine) @@ -227,9 +229,13 @@ def __init__( if dtype is not None: new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) if index is not None: - new_qc = new_qc.reindex(axis=0, labels=index) + new_qc = new_qc.reindex( + axis=0, labels=try_convert_index_to_native(index) + ) if columns is not None: - new_qc = new_qc.reindex(axis=1, labels=columns) + new_qc = new_qc.reindex( + axis=1, labels=try_convert_index_to_native(columns) + ) self._query_compiler = new_qc return @@ -239,7 +245,11 @@ def __init__( for k, v in data.items() } pandas_df = pandas.DataFrame( - data=data, index=index, columns=columns, dtype=dtype, copy=copy + data=try_convert_index_to_native(data), + index=try_convert_index_to_native(index), + columns=try_convert_index_to_native(columns), + dtype=dtype, + copy=copy, ) self._query_compiler = from_pandas(pandas_df)._query_compiler else: @@ -307,13 +317,13 @@ def _repr_html_(self): # pragma: no cover else: return result - def _get_columns(self) -> pandas.Index: + def _get_columns(self) -> pd.Index: """ Get the columns for this Snowpark pandas ``DataFrame``. Returns ------- - pandas.Index + Index The all columns. """ # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions @@ -2291,7 +2301,7 @@ def set_index( label_or_series.append(key._query_compiler) elif isinstance(key, (np.ndarray, list, Iterator)): label_or_series.append(pd.Series(key)._query_compiler) - elif isinstance(key, pd.Index): + elif isinstance(key, (pd.Index, pandas.MultiIndex)): label_or_series += [ s._query_compiler for s in self._to_series_list(key) ] diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 4807073b41d..bf0667e6365 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -1589,7 +1589,7 @@ def to_datetime( else: name = None # keep index name - if isinstance(arg, pandas.Index): + if isinstance(arg, pd.Index): name = arg.name arg = Series(arg) arg.name = name diff --git a/src/snowflake/snowpark/modin/pandas/groupby.py b/src/snowflake/snowpark/modin/pandas/groupby.py index 771034d56d3..2ceee75e73b 100644 --- a/src/snowflake/snowpark/modin/pandas/groupby.py +++ b/src/snowflake/snowpark/modin/pandas/groupby.py @@ -222,7 +222,7 @@ def __bytes__(self): # TODO: since python 3.9: # @cached_property @property - def groups(self) -> PrettyDict[Hashable, pd.Index]: + def groups(self) -> PrettyDict[Hashable, "pd.Index"]: # TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.DataFrameGroupBy functions return self._query_compiler.groupby_groups( self._by, diff --git a/src/snowflake/snowpark/modin/pandas/indexing.py b/src/snowflake/snowpark/modin/pandas/indexing.py index e9ffc5e471d..ef7e58cc770 100644 --- a/src/snowflake/snowpark/modin/pandas/indexing.py +++ b/src/snowflake/snowpark/modin/pandas/indexing.py @@ -1173,7 +1173,7 @@ def __setitem__( SET_CELL_WITH_LIST_LIKE_VALUE_ERROR_MESSAGE ) - if isinstance(item, 
pandas.Index): + if isinstance(item, pd.Index): item = np.array(item.tolist()).transpose() else: item = np.array(item) diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 4ca1c2871e1..8b022d5a1f9 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -52,7 +52,11 @@ from snowflake.snowpark.modin.pandas.accessor import CachedAccessor, SparseAccessor from snowflake.snowpark.modin.pandas.base import _ATTRS_NO_LOOKUP, BasePandasDataset from snowflake.snowpark.modin.pandas.iterator import PartitionIterator -from snowflake.snowpark.modin.pandas.utils import from_pandas, is_scalar +from snowflake.snowpark.modin.pandas.utils import ( + from_pandas, + is_scalar, + try_convert_index_to_native, +) from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike from snowflake.snowpark.modin.plugin.utils.error_message import ( ErrorMessage, @@ -140,7 +144,7 @@ def __init__( if name is None: name = MODIN_UNNAMED_SERIES_LABEL if ( - isinstance(data, (pandas.Series, pandas.Index)) + isinstance(data, (pandas.Series, pandas.Index, pd.Index)) and data.name is not None ): name = data.name @@ -148,8 +152,8 @@ def __init__( query_compiler = from_pandas( pandas.DataFrame( pandas.Series( - data=data, - index=index, + data=try_convert_index_to_native(data), + index=try_convert_index_to_native(index), dtype=dtype, name=name, copy=copy, diff --git a/src/snowflake/snowpark/modin/pandas/utils.py b/src/snowflake/snowpark/modin/pandas/utils.py index 0291121999a..63ef9c0b877 100644 --- a/src/snowflake/snowpark/modin/pandas/utils.py +++ b/src/snowflake/snowpark/modin/pandas/utils.py @@ -20,19 +20,22 @@ # Version 2.0. """Implement utils for pandas component.""" +from __future__ import annotations from collections.abc import Hashable, Iterator, Sequence from types import BuiltinFunctionType -from typing import Any, Callable, Optional, Union +from typing import Any, Callable import numpy as np import pandas from modin.core.storage_formats import BaseQueryCompiler # pragma: no cover +from pandas._libs import lib from pandas._typing import ( AggFuncType, AggFuncTypeBase, AggFuncTypeDict, AnyArrayLike, + Axes, IndexLabel, Scalar, ) @@ -290,7 +293,7 @@ def check_both_not_none(option1, option2): def _walk_aggregation_func( key: IndexLabel, value: AggFuncType, depth: int = 0 -) -> Iterator[tuple[IndexLabel, AggFuncTypeBase, Optional[str], bool]]: +) -> Iterator[tuple[IndexLabel, AggFuncTypeBase, str | None, bool]]: """ Walk over a function from a dictionary-specified aggregation. @@ -342,7 +345,7 @@ def _walk_aggregation_func( def walk_aggregation_dict( agg_dict: AggFuncTypeDict, -) -> Iterator[tuple[IndexLabel, AggFuncTypeBase, Optional[str], bool]]: +) -> Iterator[tuple[IndexLabel, AggFuncTypeBase, str | None, bool]]: """ Walk over an aggregation dictionary. @@ -383,10 +386,8 @@ def raise_if_native_pandas_objects(obj: Any) -> None: def replace_external_data_keys_with_empty_pandas_series( - keys: Optional[ - Union[Hashable, AnyArrayLike, Sequence[Union[Hashable, AnyArrayLike]]] - ] = None -) -> Optional[Union[Hashable, pandas.Series, list[Union[Hashable, pandas.Series]]]]: + keys: None | (Hashable | AnyArrayLike | Sequence[Hashable | AnyArrayLike]) = None, +) -> Hashable | pandas.Series | list[Hashable | pandas.Series] | None: """ Replace any array-like key with empty series. 
Args: @@ -430,9 +431,7 @@ def create_empty_pandas_series_from_array_like(obj: AnyArrayLike) -> pandas.Seri return obj.head(0) -def create_empty_native_pandas_frame( - obj: Union["pd.Series", "pd.DataFrame"] -) -> pandas.DataFrame: +def create_empty_native_pandas_frame(obj: pd.Series | pd.DataFrame) -> pandas.DataFrame: """ Create an empty native pandas DataFrame using the columns and index labels info from the given object. Empty here implies zero rows. @@ -454,13 +453,9 @@ def create_empty_native_pandas_frame( def replace_external_data_keys_with_query_compiler( - frame: "pd.DataFrame", - keys: Optional[ - Union[Hashable, AnyArrayLike, Sequence[Union[Hashable, AnyArrayLike]]] - ] = None, -) -> Optional[ - Union[Hashable, BaseQueryCompiler, list[Union[Hashable, BaseQueryCompiler]]] -]: + frame: pd.DataFrame, + keys: None | (Hashable | AnyArrayLike | Sequence[Hashable | AnyArrayLike]) = None, +) -> None | (Hashable | BaseQueryCompiler | list[Hashable | BaseQueryCompiler]): """ Replace any array-like join key(s) with query compiler. @@ -497,8 +492,8 @@ def replace_external_data_keys_with_query_compiler( def try_convert_builtin_func_to_str( - fn: Union[AggFuncTypeBase, list[AggFuncTypeBase]], obj: object -) -> Union[AggFuncTypeBase, list[AggFuncTypeBase]]: + fn: AggFuncTypeBase | list[AggFuncTypeBase], obj: object +) -> AggFuncTypeBase | list[AggFuncTypeBase]: """ Try to convert an aggregation function to a string or list of such if the function is a builtin function and supported in the current object dir. @@ -797,12 +792,11 @@ def _doc_binary_op(operation, bin_op, left="Series", right="right", returns="Ser def get_as_shape_compatible_dataframe_or_series( - other: Union["pd.DataFrame", "pd.Series", Callable, AnyArrayLike, Scalar], - reference_df: "pd.DataFrame", - shape_mismatch_message: Optional[ - str - ] = "Array conditional must be same shape as self", -) -> Union["pd.DataFrame", "pd.Series"]: + other: pd.DataFrame | pd.Series | Callable | AnyArrayLike | Scalar, + reference_df: pd.DataFrame, + shape_mismatch_message: None + | (str) = "Array conditional must be same shape as self", +) -> pd.DataFrame | pd.Series: """ Get the "other" type as a shape compatible dataframe or series using the reference_df as a reference for compatible shape and construction. If there is no shape on the other type then wrap as a numpy array. @@ -837,3 +831,65 @@ def get_as_shape_compatible_dataframe_or_series( _original_pandas_MultiIndex_from_frame = pandas.MultiIndex.from_frame pandas.MultiIndex.from_frame = from_modin_frame_to_mi + + +def ensure_index( + index_like: Axes | pd.Index | pd.Series, copy: bool = False +) -> pd.Index | pandas.MultiIndex: + """ + Ensure that we have an index from some index-like object. 
+ + Parameters + ---------- + index_like : sequence + An Index or other sequence + copy : bool, default False + + Returns + ------- + Index + + Examples + -------- + >>> ensure_index(['a', 'b']) + Index(['a', 'b'], dtype='object') + + >>> ensure_index([('a', 'a'), ('b', 'c')]) + Index([('a', 'a'), ('b', 'c')], dtype='object') + """ + # if we have an index object already, simply copy it if required and return + if isinstance(index_like, (pandas.MultiIndex, pd.Index)): + if copy: + index_like = index_like.copy() + return index_like + + if isinstance(index_like, list): + # if we have a non-empty list that is multi dimensional, convert this to a multi-index and return + if len(index_like) and lib.is_all_arraylike(index_like): + return pandas.MultiIndex.from_arrays(index_like) + else: + # otherwise, we have a one dimensional index, so set tupleize_cols=False and return a pd.Index + return pd.Index(index_like, copy=copy, tupleize_cols=False) + else: + return pd.Index(index_like, copy=copy) + + +def try_convert_index_to_native(index_like: Any) -> Any: + """ + Try to convert the given item to a native pandas Index. + This conversion is only performed if `index_like` is a Snowpark pandas Index. Otherwise, the original input will be returned. + + Parameters + ---------- + index_like : Any + An index-like object, such as a list, ndarray or Index object that we would like to try to convert to pandas Index + + Return + ---------- + A pandas Index if index_like is a Snowpark pandas Index, otherwise return index_like + """ + from snowflake.snowpark.modin.plugin._internal.index import Index + + if isinstance(index_like, Index): + index_like = index_like.to_pandas() + return index_like diff --git a/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py index c6d019f295f..5127cafbc25 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/apply_utils.py @@ -490,6 +490,9 @@ def create_udtf_for_groupby_apply( # Get the length of this list outside the vUDTF function because the vUDTF # doesn't have access to the Snowpark module, which defines these types. num_by = len(by_types) + from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native + + data_column_index = try_convert_index_to_native(data_column_index) class ApplyFunc: def end_partition(self, df: native_pd.DataFrame): # type: ignore[no-untyped-def] # pragma: no cover: adding type hint causes an error when creating udtf. also, skip coverage for this function because coverage tools can't tell that we're executing this function because we execute it in a UDTF. 
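The `create_udtf_for_groupby_apply` change above unwraps `data_column_index` into a native pandas object before the UDTF class closes over it, for the same reason given in the existing comment: the handler runs in an environment that does not have access to the Snowpark module, so anything it captures must be plain pandas. Below is a minimal sketch of that unwrap-before-capture pattern, with a hypothetical `LazyIndex` standing in for the wrapper class introduced later in this patch:

    import pandas as native_pd


    class LazyIndex:
        # Hypothetical stand-in for the Snowpark pandas Index wrapper.
        def __init__(self, data):
            self._index = native_pd.Index(data)

        def to_pandas(self):
            return self._index


    def try_convert_index_to_native(index_like):
        # Unwrap only the wrapper type; anything else passes through unchanged.
        if isinstance(index_like, LazyIndex):
            index_like = index_like.to_pandas()
        return index_like


    def make_handler(data_column_index):
        # Unwrap before the closure is created, so the handler only captures a
        # plain pandas Index and never references the Snowpark module.
        data_column_index = try_convert_index_to_native(data_column_index)

        def end_partition(df: native_pd.DataFrame):
            df.columns = data_column_index
            return df

        return end_partition


    handler = make_handler(LazyIndex(["a", "b"]))
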
diff --git a/src/snowflake/snowpark/modin/plugin/_internal/frame.py b/src/snowflake/snowpark/modin/plugin/_internal/frame.py index 9cf3fd9d4c1..64f76afb4e5 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/frame.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/frame.py @@ -7,7 +7,7 @@ from logging import getLogger from typing import Any, Callable, NamedTuple, Optional, Union -import pandas as pd +import pandas as native_pd from pandas._typing import IndexLabel from pandas.core.dtypes.common import is_object_dtype @@ -16,6 +16,7 @@ ) from snowflake.snowpark.column import Column as SnowparkColumn from snowflake.snowpark.functions import col, last_value +from snowflake.snowpark.modin import pandas as pd from snowflake.snowpark.modin.plugin._internal.ordered_dataframe import ( OrderedDataFrame, OrderingColumn, @@ -369,14 +370,14 @@ def is_unnamed_series(self) -> bool: ) @property - def data_columns_index(self) -> pd.Index: + def data_columns_index(self) -> "pd.Index": """ Returns pandas Index object for column index (df.columns). We can't do the same thing for df.index here because it requires pulling the data from snowflake and filing a query to snowflake. """ if self.is_multiindex(axis=1): - return pd.MultiIndex.from_tuples( + return native_pd.MultiIndex.from_tuples( self.data_column_pandas_labels, names=self.data_column_pandas_index_names, ) @@ -391,7 +392,7 @@ def data_columns_index(self) -> pd.Index: ) @property - def index_columns_index(self) -> pd.Index: + def index_columns_index(self) -> native_pd.Index: """ Get pandas index. The method eagerly pulls the values from Snowflake because index requires the values to be filled @@ -407,7 +408,7 @@ def index_columns_index(self) -> pd.Index: ).values if self.is_multiindex(axis=0): value_tuples = [tuple(row) for row in index_values] - return pd.MultiIndex.from_tuples( + return native_pd.MultiIndex.from_tuples( value_tuples, names=self.index_column_pandas_labels ) else: @@ -416,7 +417,7 @@ def index_columns_index(self) -> pd.Index: index_type = TypeMapper.to_pandas( self.quoted_identifier_to_snowflake_type()[index_identifier] ) - ret = pd.Index( + ret = native_pd.Index( [row[0] for row in index_values], name=self.index_column_pandas_labels[0], # setting tupleize_cols=False to avoid creating a MultiIndex diff --git a/src/snowflake/snowpark/modin/plugin/_internal/index.py b/src/snowflake/snowpark/modin/plugin/_internal/index.py new file mode 100644 index 00000000000..b33b715f49a --- /dev/null +++ b/src/snowflake/snowpark/modin/plugin/_internal/index.py @@ -0,0 +1,1123 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +# Code in this file may constitute partial or total reimplementation, or modification of +# existing code originally distributed by the Modin project, under the Apache License, +# Version 2.0. + +"""Module houses ``Index`` class, that is distributed version of ``pandas.Index``.""" + +from __future__ import annotations + +from typing import Any, Callable, Hashable, Iterator, Literal + +import numpy as np +import pandas as native_pd +from pandas._typing import ArrayLike, DtypeObj, NaPosition, Self +from pandas.core.arrays import ExtensionArray +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.indexes.frozen import FrozenList + +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native +from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage + + +class Index: + """ + Immutable sequence used for indexing and alignment. + + The basic object storing axis labels for all pandas objects. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Index. If not specified, this will be + inferred from `data`. + See the :ref:`user guide ` for more usages. + copy : bool, default False + Copy input data. + name : object + Name to be stored in the index. + tupleize_cols : bool (default: True) + When True, attempt to create a MultiIndex if possible. + + Notes + ----- + An Index instance can **only** contain hashable objects. + An Index instance *can not* hold numpy float16 dtype. + + Examples + -------- + >>> pd.Index([1, 2, 3]) + Index([1, 2, 3], dtype='int64') + + >>> pd.Index(list('abc')) + Index(['a', 'b', 'c'], dtype='object') + + >>> pd.Index([1, 2, 3], dtype="uint8") + Index([1, 2, 3], dtype='uint8') + """ + + # same fields as native pandas index constructor + def __init__( + self, + # Any should be replaced with SnowflakeQueryCompiler when possible (linter won't allow it now) + data: ArrayLike | Any = None, + dtype: str | np.dtype | ExtensionDtype | None = None, + copy: bool = False, + name: object = None, + tupleize_cols: bool = True, + ) -> None: + from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import ( + SnowflakeQueryCompiler, + ) + + # TODO: SNOW-1359041: Switch to lazy index implementation + if isinstance(data, native_pd.Index): + self._index = data + elif isinstance(data, Index): + self._index = data.to_pandas() + elif isinstance(data, SnowflakeQueryCompiler): + self._index = data._modin_frame.index_columns_index + else: + self._index = native_pd.Index( + data=data, + dtype=dtype, + copy=copy, + name=name, + tupleize_cols=tupleize_cols, + ) + + def to_pandas(self) -> native_pd.Index: + """ + Convert Snowpark pandas Index to pandas Index + + Returns + ------- + pandas Index + A native pandas Index representation of self + """ + return self._index + + @property + def values(self) -> ArrayLike: + """ + Return an array representing the data in the Index. + + Returns + ------- + numpy.ndarray or ExtensionArray + array representing the index data + + See Also + -------- + Index.array : Reference to the underlying data. + + Examples + -------- + For :class:`pd.Index`: + + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.values + array([1, 2, 3]) + """ + return self.to_pandas().values + + @property + def is_unique(self) -> bool: + """ + Return if the index has unique values. + + Returns + ------- + bool + True if the index has all unique values, False otherwise. 
+ + See Also + -------- + Index.has_duplicates : Inverse method that checks if it has duplicate values. + + Examples + -------- + >>> idx = pd.Index([1, 5, 7, 7]) + >>> idx.is_unique + False + + >>> idx = pd.Index([1, 5, 7]) + >>> idx.is_unique + True + + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.is_unique + False + + >>> idx = pd.Index(["Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.is_unique + True + """ + self.to_pandas_warning() + return self.to_pandas().is_unique + + @property + def has_duplicates(self) -> bool: + """ + Check if the Index has duplicate values. + + Returns + ------- + bool + True if the index has duplicate values, False otherwise. + + See Also + -------- + Index.is_unique : Inverse method that checks if it has unique values. + + Examples + -------- + >>> idx = pd.Index([1, 5, 7, 7]) + >>> idx.has_duplicates + True + + >>> idx = pd.Index([1, 5, 7]) + >>> idx.has_duplicates + False + + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.has_duplicates + True + + >>> idx = pd.Index(["Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.has_duplicates + False + """ + return not self.is_unique + + @property + def dtype(self) -> DtypeObj: + """ + Get the dtype object of the underlying data. + + Returns + ------- + DtypeObj + The dtype of the underlying data. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.dtype + dtype('int64') + """ + self.to_pandas_warning() + return self.to_pandas().dtype + + def astype(self, dtype: Any, copy: bool = True) -> Index: + """ + Create an Index with values cast to dtypes. + + The class of a new Index is determined by dtype. When conversion is + impossible, a TypeError exception is raised. + + Parameters + ---------- + dtype : numpy dtype or pandas type + Note that any signed integer `dtype` is treated as ``'int64'``, + and any unsigned integer `dtype` is treated as ``'uint64'``, + regardless of the size. + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + Returns + ------- + Index + Index with values cast to specified dtype. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.astype('float') + Index([1.0, 2.0, 3.0], dtype='float64') + """ + self.to_pandas_warning() + return Index(self.to_pandas().astype(dtype=dtype, copy=copy)) + + @property + def name(self) -> Hashable: + """ + Get the index name. + + Returns + ------- + Hashable + name of this index + + Examples + -------- + >>> idx = pd.Index([1, 2, 3], name='x') + >>> idx + Index([1, 2, 3], dtype='int64', name='x') + >>> idx.name + 'x' + """ + self.to_pandas_warning() + return self.to_pandas().name + + @name.setter + def name(self, value: Hashable) -> None: + """ + Set Index name. + """ + self.to_pandas_warning() + self.to_pandas().name = value + + def _get_names(self) -> FrozenList: + """ + Get names of index + """ + self.to_pandas_warning() + return self.to_pandas()._get_names() + + def _set_names(self, values: list) -> None: + """ + Set new names on index. Each name has to be a hashable type. 
+ + Parameters + ---------- + values : str or sequence + name(s) to set + + Raises + ------ + TypeError if each name is not hashable. + """ + self.to_pandas_warning() + self.to_pandas()._set_names(values) + + names = property(fset=_set_names, fget=_get_names) + + def set_names( + self, names: Any, level: Any = None, inplace: bool = False + ) -> Self | None: + """ + Set Index name. + + Able to set new names partially and by level. + + Parameters + ---------- + names : label or list of label or dict-like for MultiIndex + Name(s) to set. + + level : int, label or list of int or label, optional + + inplace : bool, default False + Modifies the object directly, instead of creating a new Index. + + Returns + ------- + Index or None + The same type as the caller or None if ``inplace=True``. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx + Index([1, 2, 3, 4], dtype='int64') + >>> idx.set_names('quarter') + Index([1, 2, 3, 4], dtype='int64', name='quarter') + """ + self.to_pandas_warning() + if not inplace: + return Index( + self.to_pandas().set_names(names, level=level, inplace=inplace) + ) + return self.to_pandas().set_names(names, level=level, inplace=inplace) + + @property + def nlevels(self) -> int: + """ + Number of levels. + """ + return 1 + + def copy( + self, + name: Hashable | None = None, + deep: bool = False, + ) -> Index: + """ + Make a copy of this object. + + Name is set on the new object. + + Parameters + ---------- + name : Label, optional + Set name for new object. + deep : bool, default False + + Returns + ------- + Index + Index refer to new object which is a copy of this object. + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'c']) + >>> new_idx = idx.copy() + >>> idx is new_idx + False + """ + self.to_pandas_warning() + return Index(self.to_pandas().copy(deep=deep, name=name)) + + def drop( + self, + labels: Any, + errors: Literal["ignore", "raise"] = "raise", + ) -> Index: + """ + Make new Index with passed list of labels deleted. + + Parameters + ---------- + labels : array-like or scalar + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and existing labels are dropped. + + Returns + ------- + Index + The index created will have the same type as self. + + Raises + ------ + KeyError + If not all of the labels are found in the selected axis + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx.drop(['a']) + Index(['b', 'c'], dtype='object') + """ + self.to_pandas_warning() + return Index(self.to_pandas().drop(labels=labels, errors=errors)) + + def duplicated(self, keep: Literal["first", "last", False] = "first") -> Any: + """ + Indicate duplicate index values. + + Duplicated values are indicated as ``True`` values in the resulting + array. Either all duplicates, all except the first, or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + The value or values in a set of duplicates to mark as missing. + + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. 
+ + Returns + ------- + np.ndarray[bool] + An array where duplicated values are indicated as ``True`` + + See Also + -------- + Series.duplicated : Equivalent method on pandas.Series. + DataFrame.duplicated : Equivalent method on pandas.DataFrame. + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set to False and all others to True: + + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> idx.duplicated() + array([False, False, True, False, True]) + + which is equivalent to + + >>> idx.duplicated(keep='first') + array([False, False, True, False, True]) + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> idx.duplicated(keep='last') + array([ True, False, True, False, False]) + + By setting keep on ``False``, all duplicates are True: + + >>> idx.duplicated(keep=False) + array([ True, False, True, False, True]) + """ + self.to_pandas_warning() + return self.to_pandas().duplicated(keep=keep) + + def equals(self, other: Any) -> bool: + """ + Determine if two Index object are equal. + + The things that are being compared are: + + * The elements inside the Index object. + * The order of the elements inside the Index object. + + Parameters + ---------- + other : Any + The other object to compare against. + + Returns + ------- + bool + True if "other" is an Index and it has the same elements and order + as the calling index; False otherwise. + + Examples + -------- + >>> idx1 = pd.Index([1, 2, 3]) + >>> idx1 + Index([1, 2, 3], dtype='int64') + >>> idx1.equals(pd.Index([1, 2, 3])) + True + + The elements inside are compared + + >>> idx2 = pd.Index(["1", "2", "3"]) + >>> idx2 + Index(['1', '2', '3'], dtype='object') + + >>> idx1.equals(idx2) + False + + The order is compared + + >>> ascending_idx = pd.Index([1, 2, 3]) + >>> ascending_idx + Index([1, 2, 3], dtype='int64') + >>> descending_idx = pd.Index([3, 2, 1]) + >>> descending_idx + Index([3, 2, 1], dtype='int64') + >>> ascending_idx.equals(descending_idx) + False + + The dtype is *not* compared + + >>> int64_idx = pd.Index([1, 2, 3], dtype='int64') + >>> int64_idx + Index([1, 2, 3], dtype='int64') + >>> uint64_idx = pd.Index([1, 2, 3], dtype='uint64') + >>> uint64_idx + Index([1, 2, 3], dtype='uint64') + >>> int64_idx.equals(uint64_idx) + True + """ + self.to_pandas_warning() + return self.to_pandas().equals(try_convert_index_to_native(other)) + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins: Any = None, + dropna: bool = True, + ) -> native_pd.Series: + # how to change the above return type to modin pandas series? + """ + Return a Series containing counts of unique values. + + The resulting object will be in descending order so that the + first element is the most frequently-occurring element. + Excludes NA values by default. + + Parameters + ---------- + normalize : bool, default False + If True then the object returned will contain the relative + frequencies of the unique values. + sort : bool, default True + Sort by frequencies when True. Preserve the order of the data when False. + ascending : bool, default False + Sort in ascending order. + bins : int, optional + Rather than count values, group them into half-open bins, + a convenience for ``pd.cut``, only works with numeric data. + dropna : bool, default True + Don't include counts of NaN. + + Returns + ------- + Series + A series containing counts of unique values. 
+ + See Also + -------- + Series.count: Number of non-NA elements in a Series. + DataFrame.count: Number of non-NA elements in a DataFrame. + DataFrame.value_counts: Equivalent method on DataFrames. + + Examples + -------- + >>> index = pd.Index([3, 1, 2, 3, 4, np.nan]) + >>> index.value_counts() + 3.0 2 + 1.0 1 + 2.0 1 + 4.0 1 + Name: count, dtype: int64 + + With `normalize` set to `True`, returns the relative frequency by + dividing all values by the sum of values. + + >>> ind = pd.Index([3, 1, 2, 3, 4, np.nan]) + >>> ind.value_counts(normalize=True) + 3.0 0.4 + 1.0 0.2 + 2.0 0.2 + 4.0 0.2 + Name: proportion, dtype: float64 + + **bins** + + Bins can be useful for going from a continuous variable to a + categorical variable; instead of counting unique + apparitions of values, divide the index in the specified + number of half-open bins. + """ + self.to_pandas_warning() + return self.to_pandas().value_counts( + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) + + def tolist(self) -> list: + """ + Return a list of the values. + + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) + + Returns + ------- + list + The index values in list form + + See Also + -------- + numpy.ndarray.tolist : Return the array as an a.ndim-levels deep + nested list of Python scalars. + + Examples + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + + >>> idx.to_list() + [1, 2, 3] + """ + return self.to_pandas().tolist() + + to_list = tolist + + def sort_values( + self, + return_indexer: bool = False, + ascending: bool = True, + na_position: NaPosition = "last", + key: Callable | None = None, + ) -> Index | tuple[Index, np.ndarray]: + """ + Return a sorted copy of the index. + + Return a sorted copy of the index, and optionally return the indices + that sorted the index itself. + + Parameters + ---------- + return_indexer : bool, default False + Should the indices that would sort the index be returned. + ascending : bool, default True + Should the index values be sorted in ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + key : callable, optional + If not None, apply the key function to the index values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect an + ``Index`` and return an ``Index`` of the same shape. + + Returns + ------- + Index, numpy.ndarray + Index is returned in all cases as a sorted copy of the index. + ndarray is returned when return_indexer is True, represents the indices that the index itself was sorted by. + + See Also + -------- + Series.sort_values : Sort values of a Series. + DataFrame.sort_values : Sort values in a DataFrame. + + Examples + -------- + >>> idx = pd.Index([10, 100, 1, 1000]) + >>> idx + Index([10, 100, 1, 1000], dtype='int64') + + Sort values in ascending order (default behavior). + + >>> idx.sort_values() + Index([1, 10, 100, 1000], dtype='int64') + + Sort values in descending order, and also get the indices `idx` was + sorted by. 
+ + >>> idx.sort_values(ascending=False, return_indexer=True) + (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) + """ + self.to_pandas_warning() + ret = self.to_pandas().sort_values( + return_indexer=return_indexer, + ascending=ascending, + na_position=na_position, + key=key, + ) + if return_indexer: + return Index(ret[0]), ret[1] + else: + return Index(ret) + + def intersection(self, other: Any, sort: bool = False) -> Index: + """ + Form the intersection of two Index objects. + + This returns a new Index with elements common to the index and `other`. + + Parameters + ---------- + other : Index or array-like + sort : True, False or None, default False + Whether to sort the resulting index. + + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. + * False : do not sort the result. + * True : Sort the result (which may raise TypeError). + + Returns + ------- + Index + A new Index with elements common to the index and `other`. + + Examples + -------- + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.intersection(idx2) + Index([3, 4], dtype='int64') + """ + self.to_pandas_warning() + return Index( + self.to_pandas().intersection( + other=try_convert_index_to_native(other), sort=sort + ) + ) + + def union(self, other: Any, sort: bool = False) -> Index: + """ + Form the union of two Index objects. + + If the Index objects are incompatible, both Index objects will be + cast to dtype('object') first. + + Parameters + ---------- + other : Index or array-like + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` or `other` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * False : do not sort the result. + * True : Sort the result (which may raise TypeError). + + Returns + ------- + Index + The Index that represents the union between the two indexes + + Examples + -------- + Union matching dtypes + + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.union(idx2) + Index([1, 2, 3, 4, 5, 6], dtype='int64') + + Union mismatched dtypes + + >>> idx1 = pd.Index(['a', 'b', 'c', 'd']) + >>> idx2 = pd.Index([1, 2, 3, 4]) + >>> idx1.union(idx2) + Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object') + """ + self.to_pandas_warning() + return Index( + self.to_pandas().union(other=try_convert_index_to_native(other), sort=sort) + ) + + def difference(self, other: Any, sort: Any = None) -> Index: + """ + Return a new Index with elements of index not in `other`. + + This is the set difference of two Index objects. + + Parameters + ---------- + other : Index or array-like + sort : bool or None, default None + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by pandas. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. + * True : Sort the result (which may raise TypeError). + + Returns + ------- + Index + An index object that represents the difference between the two indexes. 
+ + Examples + -------- + >>> idx1 = pd.Index([2, 1, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.difference(idx2) + Index([1, 2], dtype='int64') + >>> idx1.difference(idx2, sort=False) + Index([2, 1], dtype='int64') + """ + self.to_pandas_warning() + return Index( + self.to_pandas().difference(try_convert_index_to_native(other), sort=sort) + ) + + def get_indexer_for(self, target: Any) -> Any: + """ + Guaranteed return of an indexer even when non-unique. + + This dispatches to get_indexer or get_indexer_non_unique + as appropriate. + + Returns + ------- + np.ndarray[np.intp] + List of indices. + + Examples + -------- + >>> idx = pd.Index([np.nan, 'var1', np.nan]) + >>> idx.get_indexer_for([np.nan]) + array([0, 2]) + """ + self.to_pandas_warning() + return self.to_pandas().get_indexer_for(target=target) + + def _get_indexer_strict(self, key: Any, axis_name: str) -> tuple[Index, np.ndarray]: + """ + Analogue to pandas.Index.get_indexer that raises if any elements are missing. + """ + self.to_pandas_warning() + tup = self.to_pandas()._get_indexer_strict(key=key, axis_name=axis_name) + return Index(tup[0]), tup[1] + + def get_level_values(self, level: int | str) -> Index: + """ + Return an Index of values for requested level. + + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatibility. + + Parameters + ---------- + level : int or str + It is either the integer position or the name of the level. + + Returns + ------- + Index + self, since self only has one level + + Notes + ----- + For Index, level should be 0, since there are no multiple levels. + + Examples + -------- + >>> idx = pd.Index(list('abc')) + >>> idx + Index(['a', 'b', 'c'], dtype='object') + + Get level values by supplying `level` as integer: + + >>> idx.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object') + """ + self.to_pandas_warning() + return Index(self.to_pandas().get_level_values(level=level)) + + def slice_indexer( + self, + start: Hashable | None = None, + end: Hashable | None = None, + step: int | None = None, + ) -> slice: + """ + Compute the slice indexer for input labels and step. + + Index needs to be ordered and unique. + + Parameters + ---------- + start : label, default None + If None, defaults to the beginning. + end : label, default None + If None, defaults to the end. + step : int, default None + + Returns + ------- + slice + The slice of indices + + Raises + ------ + KeyError + If key does not exist, or key is not unique and index is not ordered. + + Notes + ----- + This function assumes that the data is sorted, so use at your own peril + + Examples + -------- + This is a method on all index types. For example you can do: + + >>> idx = pd.Index(list('abcd')) + >>> idx.slice_indexer(start='b', end='c') + slice(1, 3, None) + """ + self.to_pandas_warning() + return self.to_pandas().slice_indexer(start=start, end=end, step=step) + + @property + def array(self) -> ExtensionArray: + """ + return the array of values + """ + return self.to_pandas().array + + def _summary(self, name: Any = None) -> str: + """ + Return a summarized representation. + + Parameters + ---------- + name : str + name to use in the summary representation + + Returns + ------- + str + String with a summarized representation of the index + """ + self.to_pandas_warning() + return self.to_pandas()._summary(name=name) + + def __array__(self, dtype: Any = None) -> np.ndarray: + """ + The array interface, return the values. 
+        """
+        return self.to_pandas().__array__(dtype=dtype)
+
+    def __repr__(self) -> str:
+        """
+        Return a string representation for this object.
+        """
+        return self.to_pandas().__repr__()
+
+    def __iter__(self) -> Iterator:
+        """
+        Return an iterator of the values.
+
+        These are each a scalar type, which is a Python scalar
+        (for str, int, float) or a pandas scalar
+        (for Timestamp/Timedelta/Interval/Period)
+
+        Returns
+        -------
+        Iterator
+            Iterator of the index values
+
+        Examples
+        --------
+        >>> i = pd.Index([1, 2, 3])
+        >>> for x in i:
+        ...     print(x)
+        1
+        2
+        3
+        """
+        self.to_pandas_warning()
+        return self.to_pandas().__iter__()
+
+    def __contains__(self, key: Any) -> bool:
+        """
+        Return a boolean indicating whether the provided key is in the index.
+
+        Parameters
+        ----------
+        key : label
+            The key to check if it is present in the index.
+
+        Returns
+        -------
+        bool
+            True if the key is in the index, False otherwise.
+
+        Raises
+        ------
+        TypeError
+            If the key is not hashable.
+
+        Examples
+        --------
+        >>> idx = pd.Index([1, 2, 3, 4])
+        >>> idx
+        Index([1, 2, 3, 4], dtype='int64')
+
+        >>> 2 in idx
+        True
+        >>> 6 in idx
+        False
+        """
+        self.to_pandas_warning()
+        return self.to_pandas().__contains__(key=key)
+
+    def __len__(self) -> int:
+        """
+        Return the length of the Index as an int.
+        """
+        self.to_pandas_warning()
+        return self.to_pandas().__len__()
+
+    def __getitem__(self, key: Any) -> np.ndarray | None | Index:
+        """
+        Override numpy.ndarray's __getitem__ method to work as desired.
+
+        This function adds lists and Series as valid boolean indexers
+        (ndarrays only support ndarray with dtype=bool).
+
+        If resulting ndim != 1, plain ndarray is returned instead of
+        corresponding `Index` subclass.
+        """
+        self.to_pandas_warning()
+        return self.to_pandas().__getitem__(key=key)
+
+    def __setitem__(self, key: Any, value: Any) -> None:
+        """
+        Override numpy.ndarray's __setitem__ method to work as desired.
+
+        We raise a TypeError because the Index values are not mutable.
+        """
+        raise TypeError("Index does not support mutable operations")
+
+    @property
+    def str(self) -> Any:
+        """
+        Vectorized string functions for Series and Index.
+
+        NAs stay NA unless handled otherwise by a particular method.
+        Patterned after Python's string methods, with some inspiration from
+        R's stringr package.
+
+        Examples
+        --------
+        >>> s = pd.Series(["A_Str_Series"])
+        >>> s
+        0    A_Str_Series
+        dtype: object
+
+        >>> s.str.split("_")
+        0    [A, Str, Series]
+        dtype: object
+
+        >>> s.str.replace("_", "")
+        0    AStrSeries
+        dtype: object
+        """
+        self.to_pandas_warning()
+        return self.to_pandas().str
+
+    def to_pandas_warning(self) -> None:
+        """
+        Helper method to notify users when they are using a method that currently calls to_pandas().
+        """
+        WarningMessage.single_warning(
+            "This method currently calls to_pandas() and materializes data. 
In future updates, this method will be lazily evaluated" + ) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py index ba1953a54ad..ef64b29448f 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py @@ -7,7 +7,7 @@ from typing import Any, Literal, Optional, Union import numpy as np -import pandas +import pandas as native_pd from pandas._typing import AnyArrayLike, Scalar from pandas.api.types import is_list_like from pandas.core.common import is_bool_indexer @@ -708,7 +708,7 @@ def _extract_loc_set_col_info( tuple, slice, list, - pd.Index, + "pd.Index", np.ndarray, ], ) -> LocSetColInfo: @@ -821,7 +821,7 @@ def get_valid_col_positions_from_col_labels( tuple, slice, list, - pd.Index, + "pd.Index", np.ndarray, ], ) -> list[int]: @@ -932,7 +932,7 @@ def get_valid_col_positions_from_col_labels( col_loc = [col_loc] if col_loc in columns else list(col_loc) # Throw a KeyError in case there are any missing column labels if len(col_loc) > 0 and all(label not in columns for label in col_loc): - raise KeyError(f"None of {pandas.Index(col_loc)} are in the [columns]") + raise KeyError(f"None of {native_pd.Index(col_loc)} are in the [columns]") elif any(label not in columns for label in col_loc): raise KeyError(f"{[k for k in col_loc if k not in columns]} not in index") # Convert col_loc to Index with object dtype since _get_indexer_strict() converts None values in lists to @@ -955,7 +955,7 @@ def get_frame_by_col_label( tuple, slice, list, - pd.Index, + "pd.Index", np.ndarray, ], ) -> InternalFrame: @@ -2162,7 +2162,7 @@ def set_frame_2d_labels( tuple, slice, list, - pd.Index, + "pd.Index", np.ndarray, ], item: Union[Scalar, AnyArrayLike, InternalFrame], diff --git a/src/snowflake/snowpark/modin/plugin/_internal/io_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/io_utils.py index 2103a63caec..c883be09afc 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/io_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/io_utils.py @@ -98,7 +98,7 @@ def get_non_pandas_kwargs(kwargs: Any) -> Any: def get_columns_to_keep_for_usecols( usecols: Union[Callable, list[str], list[int]], - columns: pd.Index, + columns: "pd.Index", maintain_usecols_order: bool = False, ) -> list[Hashable]: """ diff --git a/src/snowflake/snowpark/modin/plugin/_internal/transpose_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/transpose_utils.py index 68608352e74..34abb0100aa 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/transpose_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/transpose_utils.py @@ -40,6 +40,7 @@ def transpose_empty_df( original_frame: InternalFrame, ) -> "SnowflakeQueryCompiler": # type: ignore[name-defined] # noqa: F821 + from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import ( SnowflakeQueryCompiler, ) @@ -47,7 +48,7 @@ def transpose_empty_df( return SnowflakeQueryCompiler.from_pandas( native_pd.DataFrame( columns=original_frame.index_columns_index, - index=original_frame.data_columns_index, + index=try_convert_index_to_native(original_frame.data_columns_index), ) ) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index c1cac64af72..481af458bf7 100644 --- 
a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -55,7 +55,6 @@ ) from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import is_dict_like, is_list_like, pandas_dtype -from pandas.core.indexes.api import ensure_index from pandas.io.formats.format import format_percentiles from pandas.io.formats.printing import PrettyDict @@ -372,8 +371,13 @@ def dtypes(self) -> native_pd.Series: TypeMapper.to_pandas(col_to_type[c]) for c in self._modin_frame.data_column_snowflake_quoted_identifiers ] + + from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native + return native_pd.Series( - data=types, index=self._modin_frame.data_columns_index, dtype=object + data=types, + index=try_convert_index_to_native(self._modin_frame.data_columns_index), + dtype=object, ) @property @@ -646,8 +650,12 @@ def to_pandas( self._modin_frame.index_column_pandas_labels ) + from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native + # set column names and potential casting - native_df.columns = self._modin_frame.data_columns_index + native_df.columns = try_convert_index_to_native( + self._modin_frame.data_columns_index + ) return native_df def finalize(self) -> None: @@ -1230,7 +1238,7 @@ def cache_result(self) -> "SnowflakeQueryCompiler": return SnowflakeQueryCompiler(self._modin_frame.persist_to_temporary_table()) @property - def columns(self) -> native_pd.Index: + def columns(self) -> "pd.Index": """ Get pandas column labels. @@ -1251,6 +1259,8 @@ def set_columns(self, new_pandas_labels: Axes) -> "SnowflakeQueryCompiler": a new `SnowflakeQueryCompiler` with updated column labels """ # new_pandas_names should be able to convert into an index which is consistent to pandas df.columns behavior + from snowflake.snowpark.modin.pandas.utils import ensure_index + new_pandas_labels = ensure_index(new_pandas_labels) if len(new_pandas_labels) != len(self._modin_frame.data_column_pandas_labels): raise ValueError( @@ -1481,7 +1491,7 @@ def shift( return self._shift_index(periods, freq) # type: ignore # pragma: no cover @property - def index(self) -> pd.Index: + def index(self) -> Union["pd.Index", native_pd.MultiIndex]: """ Get pandas index. The method eagerly pulls the values from Snowflake because index requires the values to be filled @@ -1489,7 +1499,10 @@ def index(self) -> pd.Index: Returns: The index (row labels) of the DataFrame. """ - return self._modin_frame.index_columns_index + if self.is_multiindex(): + return self._modin_frame.index_columns_index + else: + return pd.Index(self) def _is_scalar_in_index(self, scalar: Union[Scalar, tuple]) -> bool: """ @@ -3627,7 +3640,7 @@ def groupby_groups( by: Any, axis: int, groupby_kwargs: dict[str, Any], - ) -> PrettyDict[Hashable, pd.Index]: + ) -> PrettyDict[Hashable, "pd.Index"]: """ Get a PrettyDict mapping group keys to row labels. @@ -3716,7 +3729,7 @@ def groupby_groups( # # into {2: pd.Index([0, 4]), 9: pd.Index([0])} aggregated_as_pandas.iloc[:, 0].map( - lambda v: pd.Index( + lambda v: native_pd.Index( v, # note that the index dtype has to match the original # index's dtype, even if we could use a more restrictive @@ -3742,7 +3755,7 @@ def groupby_groups( # note that the index dtype has to match the original # index's dtype, even if we could use a more restrictive # type for this portion of the index. 
- pd.Index( + native_pd.Index( row.iloc[i], name=original_index_name, dtype=index_dtype, @@ -6311,8 +6324,10 @@ def apply( in self._modin_frame.data_column_snowflake_quoted_identifiers ] + from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native + # current columns - column_index = self._modin_frame.data_columns_index + column_index = try_convert_index_to_native(self._modin_frame.data_columns_index) # Extract return type from annotations (or lookup for known pandas functions) for func object, # if not return type could be extracted the variable will hold None. @@ -7032,10 +7047,10 @@ def to_numeric( def take_2d_labels( self, index: Union[ - "SnowflakeQueryCompiler", Scalar, tuple, slice, list, pd.Index, np.ndarray + "SnowflakeQueryCompiler", Scalar, tuple, slice, list, "pd.Index", np.ndarray ], columns: Union[ - "SnowflakeQueryCompiler", Scalar, slice, list, pd.Index, np.ndarray + "SnowflakeQueryCompiler", Scalar, slice, list, "pd.Index", np.ndarray ], ) -> "SnowflakeQueryCompiler": """ @@ -7443,7 +7458,7 @@ def set_2d_labels( tuple, slice, list, - pd.Index, + "pd.Index", np.ndarray, ], item: Union[Scalar, AnyArrayLike, "SnowflakeQueryCompiler"], diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/base.py b/src/snowflake/snowpark/modin/plugin/docstrings/base.py index 8a620347d47..1dbd11221bc 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/base.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/base.py @@ -1463,8 +1463,9 @@ def iloc(): With a callable, useful in method chains. The `x` passed to the ``lambda`` is the DataFrame being sliced. This selects the rows whose index labels are even. + # TODO: SNOW-1372242: Remove instances of to_pandas when lazy index is implemented - >>> df.iloc[lambda x: x.index % 2 == 0] + >>> df.iloc[lambda x: x.index.to_pandas() % 2 == 0] a b c d 0 1 2 3 4 2 1000 2000 3000 4000 diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py index e3b7216d7e7..22d68216671 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py @@ -2922,8 +2922,9 @@ def squeeze(): Slicing a single row from a single column will produce a single scalar DataFrame: + # TODO: SNOW-1372242: Remove instances of to_pandas when lazy index is implemented - >>> df_0a = df.loc[df.index < 1, ['a']] + >>> df_0a = df.loc[df.index.to_pandas() < 1, ['a']] >>> df_0a a 0 1 diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py index 52edcf98dfd..735fa2ad535 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -2425,8 +2425,9 @@ def squeeze(): Slicing a single row from a single column will produce a single scalar DataFrame: + # TODO: SNOW-1372242: Remove instances of to_pandas when lazy index is implemented - >>> df_0a = df.loc[df.index < 1, ['a']] + >>> df_0a = df.loc[df.index.to_pandas() < 1, ['a']] >>> df_0a a 0 1 diff --git a/src/snowflake/snowpark/modin/plugin/extensions/pd_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/pd_overrides.py index ae809aa7284..587f51b530d 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/pd_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/pd_overrides.py @@ -40,6 +40,10 @@ if TYPE_CHECKING: # pragma: no cover import csv +from snowflake.snowpark.modin.plugin._internal.index 
import Index # noqa: F401 + +register_pd_accessor("Index")(Index) + @_inherit_docstrings(native_pd.read_csv, apilink="pandas.read_csv") @register_pd_accessor("read_csv") diff --git a/tests/integ/modin/binary/test_binary_op.py b/tests/integ/modin/binary/test_binary_op.py index 42435e2b699..f142083dac9 100644 --- a/tests/integ/modin/binary/test_binary_op.py +++ b/tests/integ/modin/binary/test_binary_op.py @@ -16,6 +16,7 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from tests.integ.conftest import running_on_public_ci from tests.integ.modin.series.test_bitwise_operators import try_cast_to_snow_series from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker @@ -1008,7 +1009,9 @@ def test_binary_arithmetic_ops_between_df_and_list_like_on_axis_1(op, rhs): def test_binary_div_between_series_and_list_like(op, rhs): lhs = [25, 2.5, 0.677, -3.33, -12] eval_snowpark_pandas_result( - *create_test_series(lhs), lambda df: getattr(df, op)(rhs), atol=0.001 + *create_test_series(lhs), + lambda df: getattr(df, op)(try_convert_index_to_native(rhs)), + atol=0.001, ) diff --git a/tests/integ/modin/conftest.py b/tests/integ/modin/conftest.py index ca4a8ce3edc..c7aa506d625 100644 --- a/tests/integ/modin/conftest.py +++ b/tests/integ/modin/conftest.py @@ -189,60 +189,70 @@ def create_multiindex_with_dt64tz_level() -> pd.MultiIndex: @pytest.fixture(scope="session") def indices_dict(): return { - "string": pd.Index([f"i-{i}" for i in range(INDEX_SAMPLE_SIZE)], dtype=object), - "int": pd.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="int16"), - "range": pd.RangeIndex(0, INDEX_SAMPLE_SIZE, 1), - "float": pd.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="float"), - "repeats": pd.Index([0, 0, 1, 1, 2, 2] * int(INDEX_SAMPLE_SIZE / 6)), - "bool-dtype": pd.Index(np.random.randn(INDEX_SAMPLE_SIZE) < 0), - "tuples": pd.MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), + "string": pandas.Index( + [f"i-{i}" for i in range(INDEX_SAMPLE_SIZE)], dtype=object + ), + "int": pandas.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="int16"), + "range": pandas.RangeIndex(0, INDEX_SAMPLE_SIZE, 1), + "float": pandas.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="float"), + "repeats": pandas.Index([0, 0, 1, 1, 2, 2] * int(INDEX_SAMPLE_SIZE / 6)), + "bool-dtype": pandas.Index(np.random.randn(INDEX_SAMPLE_SIZE) < 0), + "tuples": pandas.MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), "multi": create_multiindex(), # NumericIndex is a pandas 2.x feature - "num_int64": pd.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="int64"), - "num_float64": pd.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="float64"), - "empty": pd.Index([]), - "bool-object": pd.Index([True, False] * HALF_INDEX_SAMPLE_SIZE, dtype=object), - "string-python": pd.Index( + "num_int64": pandas.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="int64"), + "num_float64": pandas.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="float64"), + "empty": pandas.Index([]), + "bool-object": pandas.Index( + [True, False] * HALF_INDEX_SAMPLE_SIZE, dtype=object + ), + "string-python": pandas.Index( pd.array( - pd.Index([f"i-{i}" for i in range(INDEX_SAMPLE_SIZE)], dtype=object), + pandas.Index( + [f"i-{i}" for i in range(INDEX_SAMPLE_SIZE)], dtype=object + ), dtype="string[python]", ) ), - "nullable_int": pd.Index(nullable_int_sample, dtype="Int64"), - "nullable_uint": pd.Index(nullable_int_sample, dtype="UInt16"), - "nullable_float": 
pd.Index(nullable_int_sample, dtype="Float32"), - "nullable_bool": pd.Index( + "nullable_int": pandas.Index(nullable_int_sample, dtype="Int64"), + "nullable_uint": pandas.Index(nullable_int_sample, dtype="UInt16"), + "nullable_float": pandas.Index(nullable_int_sample, dtype="Float32"), + "nullable_bool": pandas.Index( nullable_bool_sample.astype(bool), dtype="boolean", ), - "uint": pd.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="uint"), - "uint-small": pd.Index([1, 2, 3], dtype="uint64"), - "timedelta": pd.timedelta_range( + "uint": pandas.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="uint"), + "uint-small": pandas.Index([1, 2, 3], dtype="uint64"), + "timedelta": pandas.timedelta_range( start="1 day", periods=INDEX_SAMPLE_SIZE, freq="D" ), "multi-with-dt64tz-level": create_multiindex_with_dt64tz_level(), # NumericIndex is a pandas 2.x feature - "num_int32": pd.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="int32"), - "num_int16": pd.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="int16"), - "num_int8": pd.Index(np.arange(INDEX_SAMPLE_SIZE)).astype("int8"), - "num_uint64": pd.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="uint64"), - "num_uint32": pd.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="uint32"), - "num_uint16": pd.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="uint16"), - "num_uint8": pd.Index(np.arange(INDEX_SAMPLE_SIZE)).astype("uint8"), - "num_float32": pd.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="float32"), - "categorical": pd.Index(list("abcde") * 20, dtype="category"), - "interval": pd.IntervalIndex.from_breaks(np.linspace(0, 100, num=101)), - "complex64": pd.Index(np.arange(100)).astype("complex64"), - "complex128": pd.Index(np.arange(100)).astype("complex128"), - "period": pd.period_range( + "num_int32": pandas.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="int32"), + "num_int16": pandas.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="int16"), + "num_int8": pandas.Index(np.arange(INDEX_SAMPLE_SIZE)).astype("int8"), + "num_uint64": pandas.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="uint64"), + "num_uint32": pandas.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="uint32"), + "num_uint16": pandas.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="uint16"), + "num_uint8": pandas.Index(np.arange(INDEX_SAMPLE_SIZE)).astype("uint8"), + "num_float32": pandas.Index(np.arange(INDEX_SAMPLE_SIZE), dtype="float32"), + "categorical": pandas.Index(list("abcde") * 20, dtype="category"), + "interval": pandas.IntervalIndex.from_breaks(np.linspace(0, 100, num=101)), + "complex64": pandas.Index(np.arange(100)).astype("complex64"), + "complex128": pandas.Index(np.arange(100)).astype("complex128"), + "period": pandas.period_range( start=datetime(2000, 1, 1), periods=100, freq="D", name="period[B]" ), # failed due to no "datetime": pandas.DatetimeIndex( - pd.bdate_range(datetime(2000, 1, 1), periods=INDEX_SAMPLE_SIZE, freq="B") + pandas.bdate_range( + datetime(2000, 1, 1), periods=INDEX_SAMPLE_SIZE, freq="B" + ) ), "datetime-tz": pandas.DatetimeIndex( - pd.bdate_range(datetime(2000, 1, 1), periods=INDEX_SAMPLE_SIZE, freq="B"), + pandas.bdate_range( + datetime(2000, 1, 1), periods=INDEX_SAMPLE_SIZE, freq="B" + ), tz="US/Pacific", ), } diff --git a/tests/integ/modin/frame/test_axis.py b/tests/integ/modin/frame/test_axis.py index 281f633f453..351a86ed481 100644 --- a/tests/integ/modin/frame/test_axis.py +++ b/tests/integ/modin/frame/test_axis.py @@ -8,12 +8,16 @@ import numpy as np import pandas as native_pd import pytest -from pandas._testing import assert_index_equal import snowflake.snowpark.modin.plugin # noqa: F401 +from 
snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from snowflake.snowpark.modin.plugin.utils.warning_message import WarningMessage from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker -from tests.integ.modin.utils import VALID_PANDAS_LABELS, eval_snowpark_pandas_result +from tests.integ.modin.utils import ( + VALID_PANDAS_LABELS, + assert_index_equal, + eval_snowpark_pandas_result, +) def assert_axes_result_equal(snow_res, pd_res): @@ -126,7 +130,10 @@ def test_columns(test_df): def set_columns_func(df, labels): - df.columns = labels + if isinstance(df, pd.DataFrame): + df.columns = labels + else: + df.columns = try_convert_index_to_native(labels) return df.columns @@ -135,18 +142,23 @@ def set_columns_func(df, labels): [ ["a", "b"], [1.3, 2], - pd.Index([1.3, 2]), + native_pd.Index([1.3, 2]), [None, int], [(42, "test"), (1, 2, 3)], - pd.Index(["a", "b"]), + native_pd.Index(["a", "b"]), [("A",), ("B",)], [("A", "a", 1), ("B", "b", 1)], [["A", "a"], ["B", "b"]], - pd.MultiIndex.from_tuples([("A", "a"), ("B", "b")]), + native_pd.MultiIndex.from_tuples([("A", "a"), ("B", "b")]), ], ) @sql_count_checker(query_count=0) def test_set_columns(columns): + if isinstance(columns, native_pd.Index) and not isinstance( + columns, native_pd.MultiIndex + ): + columns = pd.Index(columns) + eval_snowpark_pandas_result( pd.DataFrame(test_dfs[0].copy()), test_dfs[0].copy(), @@ -241,7 +253,7 @@ def test_duplicate_labels_assignment(): # Duplicate between index and data label snow_df = pd.DataFrame( - {"b": [1, 2]}, index=pd.RangeIndex(start=4, stop=6, step=1, name="a") + {"b": [1, 2]}, index=native_pd.RangeIndex(start=4, stop=6, step=1, name="a") ) snow_df.columns = ["a"] assert snow_df.columns.tolist() == ["a"] diff --git a/tests/integ/modin/frame/test_getattr.py b/tests/integ/modin/frame/test_getattr.py index 9e7d0c2e848..db74dfa107e 100644 --- a/tests/integ/modin/frame/test_getattr.py +++ b/tests/integ/modin/frame/test_getattr.py @@ -7,11 +7,11 @@ import modin.pandas as pd import pandas as native_pd import pytest -from pandas._testing import assert_index_equal import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( + assert_index_equal, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, eval_snowpark_pandas_result, ) diff --git a/tests/integ/modin/frame/test_getitem.py b/tests/integ/modin/frame/test_getitem.py index f9eed74da13..1e91c5c3b38 100644 --- a/tests/integ/modin/frame/test_getitem.py +++ b/tests/integ/modin/frame/test_getitem.py @@ -12,6 +12,7 @@ from pandas import isna import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_frame_equal, @@ -29,10 +30,10 @@ [True] * 7, [False] * 7, np.array([True, True, False, False, False, True, True], dtype=bool), - pd.Index([True, True, False, False, False, True, True]), + native_pd.Index([True, True, False, False, False, True, True]), [True], [True, True, False, False, False, True, True, True], - pd.Index([], dtype=bool), + native_pd.Index([], dtype=bool), np.array([], dtype=bool), ], ) @@ -40,12 +41,16 @@ def test_df_getitem_with_boolean_list_like( key, default_index_snowpark_pandas_df, default_index_native_df ): # df[boolean list-like key] is the same as df.loc[:, boolean list-like key] + if isinstance(key, 
native_pd.Index): + key = pd.Index(key) + def get_helper(df): if isinstance(df, pd.DataFrame): return df[key] # If pandas df, adjust the length of the df and key since boolean keys need to be the same length as the axis. + _key = try_convert_index_to_native(key) _df = df.iloc[: len(key)] - _key = key[: _df.shape[1]] + _key = _key[: _df.shape[1]] return _df[_key] eval_snowpark_pandas_result( @@ -60,7 +65,7 @@ def get_helper(df): "key", [ [random.choice("ABCDEFG") for _ in range(random.randint(1, 20))], - pd.Index(random.choice("ABCDEFG") for _ in range(random.randint(1, 20))), + native_pd.Index(random.choice("ABCDEFG") for _ in range(random.randint(1, 20))), np.array([random.choice("ABCDEFG") for _ in range(random.randint(1, 20))]), ], ) @@ -68,8 +73,15 @@ def test_df_getitem_with_string_list_like( key, default_index_snowpark_pandas_df, default_index_native_df ): # df[string list-like key] is the same as df.loc[:, string list-like key] + if isinstance(key, native_pd.Index): + key = pd.Index(key) + def get_helper(df): - return df[key] + if isinstance(df, pd.DataFrame): + return df[key] + else: + _key = try_convert_index_to_native(key) + return df[_key] eval_snowpark_pandas_result( default_index_snowpark_pandas_df, @@ -83,14 +95,21 @@ def get_helper(df): "key", [ [random.choice(range(7)) for _ in range(random.randint(1, 20))], - pd.Index(random.choice(range(7)) for _ in range(random.randint(1, 20))), + native_pd.Index(random.choice(range(7)) for _ in range(random.randint(1, 20))), np.array([random.choice(range(7)) for _ in range(random.randint(1, 20))]), ], ) def test_df_getitem_with_int_list_like(key): # df[int list-like key] is the same as df.loc[:, int list-like key] + if isinstance(key, native_pd.Index): + key = pd.Index(key) + def get_helper(df): - return df[key] + if isinstance(df, pd.DataFrame): + return df[key] + else: + _key = try_convert_index_to_native(key) + return df[_key] # Generate a dict that maps from int -> list of random ints data = {i: [random.choice(range(10)) for _ in range(5)] for i in range(7)} @@ -170,7 +189,7 @@ def test_df_getitem_with_none_nan_columns(): eval_snowpark_pandas_result( snow_df, native_df, - lambda df: df[key], + lambda df: df[try_convert_index_to_native(key)], expect_exception=True, expect_exception_type=ValueError, expect_exception_match="Cannot mask with non-boolean array containing NA / NaN values", @@ -180,7 +199,7 @@ def test_df_getitem_with_none_nan_columns(): @pytest.mark.parametrize( "key", [ - pd.Index(list(t), dtype="object") + native_pd.Index(list(t), dtype="object") for t in itertools.combinations(LABEL_COLLECTION, 2) # This combination is covered by test_df_getitem_with_none_nan_columns if list(t) != [None, np.nan] @@ -190,6 +209,8 @@ def test_df_getitem_with_none_nan_columns(): def test_df_getitem_with_labels_two_columns_with_index(key): snow_df = pd.DataFrame(DATA, columns=LABEL_COLLECTION) native_df = native_pd.DataFrame(DATA, columns=LABEL_COLLECTION) + if isinstance(key, native_pd.Index): + key = pd.Index(key) def helper(df): ans_df = df[key] diff --git a/tests/integ/modin/frame/test_iloc.py b/tests/integ/modin/frame/test_iloc.py index af47c042406..5246e5c4661 100644 --- a/tests/integ/modin/frame/test_iloc.py +++ b/tests/integ/modin/frame/test_iloc.py @@ -18,6 +18,7 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from tests.integ.conftest import running_on_public_ci from 
tests.integ.modin.frame.test_head_tail import eval_result_and_query_with_no_join from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker @@ -279,7 +280,9 @@ def test_df_iloc_get_row_input_snowpark_pandas_return_dataframe( eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, - lambda df: df.iloc[iloc_snowpark_pandas_input_map[key]], + lambda df: df.iloc[ + try_convert_index_to_native(iloc_snowpark_pandas_input_map[key]) + ], ) @@ -298,7 +301,9 @@ def eval_func(df): label = iloc_snowpark_pandas_input_map[key] # convert to native pandas because iloc_snowpandas_input_map[key] holds SnowPandas objects - if not isinstance(df, DataFrame) and isinstance(label, (Series, DataFrame)): + if not isinstance(df, DataFrame) and isinstance( + label, (Series, DataFrame, pd.Index) + ): label = label.to_pandas() return df.iloc[slice(None), label] @@ -339,7 +344,7 @@ def test_df_iloc_get_callable( eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, - lambda df: df.iloc[lambda x: x.index % 2 == 0], + lambda df: df.iloc[lambda x: try_convert_index_to_native(x.index) % 2 == 0], ) def test_func(df: DataFrame) -> Series: @@ -354,7 +359,9 @@ def test_func(df: DataFrame) -> Series: eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, - lambda df: df.iloc[(lambda x: x.index % 2 == 0, [2, 3])], + lambda df: df.iloc[ + lambda x: try_convert_index_to_native(x.index) % 2 == 0, [2, 3] + ], ) @@ -692,6 +699,8 @@ def iloc_helper(df): # Native pandas does not support iloc with Snowpark Series. _key = pd.Series(_key, dtype=bool) + if isinstance(_df, native_pd.DataFrame): + _key = try_convert_index_to_native(_key) return _df.iloc[_key] if axis == "row" else _df.iloc[:, _key] query_count = 2 if (key_type == "series" and axis == "col") else 1 @@ -918,6 +927,9 @@ def iloc_helper(df): elif key_type == "series" and isinstance(df, pd.DataFrame): # Native pandas does not support iloc with Snowpark Series. _key = pd.Series(_key, dtype=float if len(key) == 0 else None) + + if isinstance(df, native_pd.DataFrame): + _key = try_convert_index_to_native(_key) return df.iloc[_key] if axis == "row" else df.iloc[:, _key] query_count = 2 if (key_type == "series" and axis == "col") else 1 @@ -1250,8 +1262,8 @@ def test_df_iloc_get_invalid_slice_key_negative( np.nan, np.array(["this", "is", "an", "ndarray!"]), native_pd.Index(["index", "of", "strings"]), - pd.Index([]), - pd.Index([], dtype=str), + native_pd.Index([]), + native_pd.Index([], dtype=str), "string", "test", ["list", "of", "strings"], @@ -1266,6 +1278,8 @@ def test_df_iloc_get_non_numeric_key_negative( # Check whether invalid non-numeric keys passed in raise TypeError. list-like objects need to be numeric, scalar # keys can only be integers. Native pandas Series and DataFrames are invalid inputs. + if isinstance(key, native_pd.Index): + key = pd.Index(key) # General case fails with TypeError. 
error_msg = re.escape(f".iloc requires numeric indexers, got {key}") with pytest.raises(IndexError, match=error_msg): @@ -1770,13 +1784,15 @@ def test_df_iloc_set_with_row_key_slice_range(numeric_test_data_4x4, start, stop [ lambda l: list(l), lambda l: np.array(l), - lambda l: pd.Index([(tuple(t) if is_list_like(t) else t) for t in l]), + lambda l: native_pd.Index([(tuple(t) if is_list_like(t) else t) for t in l]), ], ) def test_df_iloc_set_with_row_key_list( numeric_test_data_4x4, row_pos, col_pos, item_values, list_convert ): row_pos = list_convert(row_pos) + if isinstance(row_pos, native_pd.Index): + row_pos = pd.Index(row_pos) expected_query_count = 1 expected_join_count = 2 if isinstance(item_values, int) else 3 @@ -1791,8 +1807,8 @@ def test_df_iloc_set_with_row_key_list( item_values, item_values, wrap_item="na", - wrap_row="na", - wrap_col="na", + wrap_row="index", + wrap_col="index", ) @@ -2228,6 +2244,9 @@ def wrap_key_as_expected_type(wrap_type, snow_pos, native_pos): elif wrap_type == "tuple": snow_key = tuple(snow_pos) native_key = tuple(native_pos) + elif wrap_type == "index": + snow_key = pd.Index(snow_pos) + native_key = native_pd.Index(native_pos) else: snow_key = snow_pos native_key = native_pos diff --git a/tests/integ/modin/frame/test_insert.py b/tests/integ/modin/frame/test_insert.py index c580a1857b2..6e1c0cf86b4 100644 --- a/tests/integ/modin/frame/test_insert.py +++ b/tests/integ/modin/frame/test_insert.py @@ -22,7 +22,7 @@ def snow_df(): return pd.DataFrame( {"col1": ["one", "two", "three"], "col2": ["abc", "pqr", "xyz"]}, - index=native_pd.Index([5, 1, 0]), + index=pd.Index([5, 1, 0]), ) @@ -315,7 +315,7 @@ def test_insert_empty_multiindex_frame(value): expected_df = native_pd.DataFrame( value, columns=["col3"], - index=pd.Index([(None, None)] * 4), + index=native_pd.Index([(None, None)] * 4), ) assert_snowpark_pandas_equal_to_pandas(snow_df, expected_df) @@ -730,8 +730,8 @@ def test_insert_with_unique_and_duplicate_index_values( snow_df1 = pd.DataFrame(data1, index=index) snow_df2 = pd.DataFrame(data2, index=other_index) - native_df1 = native_pd.DataFrame(data1, index=index) - native_df2 = native_pd.DataFrame(data2, index=other_index) + native_df1 = native_pd.DataFrame(data1, index=index.to_pandas()) + native_df2 = native_pd.DataFrame(data2, index=other_index.to_pandas()) def insert_op(df): df.insert( diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index 7f2a2af18e1..f9d6a92297b 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -14,6 +14,7 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( @@ -910,11 +911,13 @@ def test_df_loc_set_list_like_row_key(row_key, key_type): ) # test case for df.loc[row_key] = item - def key_converter(key): + def key_converter(key, df): # Convert key to the required type. 
_key = key if key_type == "index": - _key = pd.Index(key) + _key = ( + pd.Index(key) if isinstance(df, pd.DataFrame) else native_pd.Index(key) + ) elif key_type == "array": _key = np.array(key) return _key @@ -924,15 +927,15 @@ def loc_set_helper(df): if (0 < len(row_key) != len(df)) and is_bool(row_key[0]): # pandas raises IndexError if length of like-like boolean row key is not equal to the number of rows. with pytest.raises(IndexError, match="Boolean index has wrong length"): - df.loc[key_converter(row_key)] = item - _row_key = key_converter(row_key + [False] * (len(df) - len(row_key)))[ - : len(df) - ] + df.loc[key_converter(row_key, df)] = item + _row_key = key_converter( + row_key + [False] * (len(df) - len(row_key)), df + )[: len(df)] else: - _row_key = key_converter(row_key) + _row_key = key_converter(row_key, df) df.loc[_row_key] = item else: - _row_key = key_converter(row_key) + _row_key = key_converter(row_key, df) df.loc[_row_key] = pd.DataFrame(item) with SqlCounter(query_count=1, join_count=expected_join_count): @@ -946,15 +949,15 @@ def loc_set_helper(df): if (0 < len(row_key) != len(df)) and is_bool(row_key[0]): # pandas raises IndexError if length of like-like boolean row key is not equal to the number of rows. with pytest.raises(IndexError, match="Boolean index has wrong length"): - df.loc[key_converter(row_key)] = item - _row_key = key_converter(row_key + [False] * (len(df) - len(row_key)))[ - : len(df) - ] + df.loc[key_converter(row_key, df)] = item + _row_key = key_converter( + row_key + [False] * (len(df) - len(row_key)), df + )[: len(df)] else: - _row_key = key_converter(row_key) + _row_key = key_converter(row_key, df) df.loc[_row_key, :] = item else: - _row_key = key_converter(row_key) + _row_key = key_converter(row_key, df) df.loc[_row_key, :] = pd.DataFrame(item) with SqlCounter(query_count=1, join_count=expected_join_count): @@ -1008,8 +1011,8 @@ def key_converter(key): def loc_set_helper(df): if isinstance(df, native_pd.DataFrame): with pytest.raises(KeyError, match="not in index"): - df.loc[row_key_with_oob] = item - df.loc[valid_row_key] = item + df.loc[try_convert_index_to_native(row_key_with_oob)] = item + df.loc[try_convert_index_to_native(valid_row_key)] = item else: _row_key_with_oob = ( pd.Series(row_key_with_oob) @@ -1024,8 +1027,11 @@ def loc_set_helper(df): def loc_set_helper(df): if isinstance(df, native_pd.DataFrame): with pytest.raises(KeyError, match="not in index"): - df.loc[row_key_with_oob, :] = item - df.loc[valid_row_key, :] = item + df.loc[try_convert_index_to_native(row_key_with_oob), :] = item + df.loc[ + try_convert_index_to_native(valid_row_key), + :, + ] = item else: _row_key_with_oob = ( pd.Series(row_key_with_oob) @@ -1086,7 +1092,11 @@ def key_converter(df): _row_key = row_key # Convert key to the required type. if key_type == "index": - _row_key = pd.Index(_row_key) + _row_key = ( + pd.Index(_row_key) + if isinstance(df, pd.DataFrame) + else native_pd.Index(_row_key) + ) elif key_type == "array": _row_key = np.array(_row_key) elif key_type == "series": @@ -1112,7 +1122,10 @@ def loc_set_helper(df): if isinstance(col_key, (list, native_pd.Series)) and 2 in col_key: # Set the new columns to NaN values to prevent assignment of byte values. df.loc[:, ["E", 1, "X", 2]] = np.nan - df.loc[row_key, col_key] = item + df.loc[ + row_key, + try_convert_index_to_native(col_key), + ] = item else: key = ( row_key, @@ -1156,7 +1169,11 @@ def key_converter(df): _row_key = row_key # Convert key to the required type. 
if key_type == "index": - _row_key = pd.Index(_row_key) + _row_key = ( + pd.Index(_row_key) + if isinstance(df, pd.DataFrame) + else native_pd.Index(_row_key) + ) elif key_type == "array": _row_key = np.array(_row_key) elif key_type == "series": @@ -1171,7 +1188,10 @@ def loc_set_helper(df): # convert row key to appropriate type row_key = key_converter(df) if isinstance(df, native_pd.DataFrame): - df.loc[row_key, col_key] = item + df.loc[ + try_convert_index_to_native(row_key), + try_convert_index_to_native(col_key), + ] = item else: key = ( row_key, @@ -1247,7 +1267,11 @@ def key_converter(df): _row_key = row_key # Convert key to the required type. if key_type == "index": - _row_key = pd.Index(_row_key) + _row_key = ( + pd.Index(_row_key) + if isinstance(df, pd.DataFrame) + else native_pd.Index(_row_key) + ) elif key_type == "ndarray": _row_key = np.array(_row_key) elif key_type == "series": @@ -1573,7 +1597,11 @@ def loc_helper(df): # Convert key to the required type. if key_type == "index": - _key = pd.Index(_key, dtype=bool) + _key = ( + pd.Index(_key, dtype=bool) + if isinstance(df, pd.DataFrame) + else native_pd.Index(_key, dtype=bool) + ) elif key_type == "ndarray": _key = np.array(_key, dtype=bool) elif key_type == "series": @@ -1916,7 +1944,11 @@ def loc_key_type_convert(key, is_snow_type, index_name=None): elif key_type == "index": # native pandas has a bug to overwrite loc result's index name to the key's index name # so for testing, we overwrite the index name to be the same as the index name in the main frame - return pd.Index(key.to_list(), name=index_name) + return ( + pd.Index(key.to_list(), name=index_name) + if is_snow_type + else native_pd.Index(key.to_list(), name=index_name) + ) # default index with SqlCounter(query_count=1, join_count=1): @@ -2492,6 +2524,8 @@ def perform_loc_set(df): item_ = pd.DataFrame(item_value) elif isinstance(item_value, native_pd.Series): item_ = pd.Series(item_value) + else: + item_ = try_convert_index_to_native(item_) if col_key is None: df.loc[row_key] = item_ else: @@ -2811,7 +2845,7 @@ def loc_set_helper(df): "X", ["B"], ["Y"], - pd.Index(["A"]), + native_pd.Index(["A"]), None, # Should enlarge dataframe and create new column named `None`. ], ) @@ -2836,6 +2870,9 @@ def test_df_loc_set_with_column_wise_list_like_item( snow_df = pd.DataFrame(native_df) native_item = item + if isinstance(col_key, native_pd.Index): + col_key = pd.Index(col_key) + def loc_set_helper(df): if isinstance(df, pd.DataFrame): df.loc[row_key, col_key] = item_to_type(item) @@ -2844,7 +2881,7 @@ def loc_set_helper(df): # convert ["B"] to "B" for comparison. df.loc[ row_key, (col_key[0] if convert_list_to_string else col_key) - ] = item_to_type(native_item) + ] = try_convert_index_to_native(item_to_type(native_item)) convert_list_to_string = False if ( @@ -3001,9 +3038,12 @@ def test_df_loc_set_with_row_wise_list_like_item( native_item = item def loc_set_helper(df): - df.loc[row_key, col_key] = item_to_type( - item if isinstance(df, pd.DataFrame) else native_item - ) + if isinstance(df, pd.DataFrame): + new_item = item_to_type(item) + df.loc[row_key, col_key] = new_item + else: + new_item = item_to_type(native_item) + df.loc[row_key, col_key] = try_convert_index_to_native(new_item) # Native pandas has different error messages depending on whether a column not present in the df is used in the # column key. 
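# The hunks above and below all apply the same conversion; a minimal
# standalone sketch of the pattern follows (`try_convert_index_to_native` is
# the real helper from snowflake.snowpark.modin.pandas.utils; `snow_df`,
# `native_df`, and the key values are illustrative). Native pandas cannot
# consume a lazy Snowpark pandas Index, so keys are converted back to a
# native Index before they reach a native frame, while non-Index keys pass
# through unchanged.
#
#     key = pd.Index(["A", "B"])                     # lazy Snowpark pandas Index
#     native_key = try_convert_index_to_native(key)  # -> native_pd.Index(["A", "B"])
#     native_df.loc[:, native_key] = 0               # native pandas never sees the lazy object
#     snow_df.loc[:, key] = 0                        # Snowpark pandas accepts the lazy key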
@@ -3025,7 +3065,9 @@ def loc_set_helper(df): # Only native pandas raises an error if a column not present in df is used when item and column key lengths # don't match. with pytest.raises(ValueError, match=err_msg): - native_df.loc[row_key, col_key] = item_to_type(item) + native_df.loc[row_key, col_key] = try_convert_index_to_native( + item_to_type(item) + ) # Change item so that native pandas result matches expected Snowpark pandas result. native_item = ( item[: len(col_key)] @@ -3045,7 +3087,9 @@ def loc_set_helper(df): # ValueError: shape mismatch: value array of shape (4,) could not be broadcast to indexing result of shape (2,3) native_err_msg = re.escape("array") with pytest.raises(ValueError, match=native_err_msg): - native_df.loc[row_key, col_key] = item_to_type(item) + native_df.loc[row_key, col_key] = try_convert_index_to_native( + item_to_type(item) + ) with SqlCounter(query_count=0, join_count=0): snowpark_err_msg = ( "Must have equal len keys and value when setting with an iterable" @@ -3560,7 +3604,7 @@ def loc_set_helper(df): if isinstance(key, native_pd.Series) and isinstance(df, pd.DataFrame): df.loc[pd.Series(key)] = item else: - df.loc[key] = item + df.loc[try_convert_index_to_native(key)] = try_convert_index_to_native(item) eval_snowpark_pandas_result( default_index_snowpark_pandas_df, @@ -3590,7 +3634,7 @@ def loc_set_helper(df): if isinstance(key, native_pd.Series) and isinstance(df, pd.DataFrame): df.loc[pd.Series(key)] = item else: - df.loc[key] = item + df.loc[try_convert_index_to_native(key)] = item # CASE 1: type of Snowflake column matches item type: # The df should not change. @@ -3643,7 +3687,10 @@ def loc_set_helper(df): _key = key if isinstance(df, pd.DataFrame): _key = pd.Series(key) if isinstance(key, native_pd.Series) else key - df.loc[_key] = item + df.loc[_key] = item + else: + _key = try_convert_index_to_native(_key) + df.loc[_key] = try_convert_index_to_native(item) with SqlCounter(query_count=1, join_count=2): eval_snowpark_pandas_result( @@ -3673,6 +3720,9 @@ def loc_set_helper(df): if isinstance(df, pd.DataFrame): _key = pd.Series(key) if isinstance(key, native_pd.Series) else key _item = pd.Series(item) if isinstance(item, native_pd.Series) else item + else: + _key = try_convert_index_to_native(key) + _item = try_convert_index_to_native(item) df.loc[_key] = _item with SqlCounter(query_count=0, join_count=0): diff --git a/tests/integ/modin/frame/test_mask.py b/tests/integ/modin/frame/test_mask.py index 1fc2d725b73..003cb2015e7 100644 --- a/tests/integ/modin/frame/test_mask.py +++ b/tests/integ/modin/frame/test_mask.py @@ -633,7 +633,7 @@ def test_dataframe_mask_with_dataframe_cond_single_index_different_names_2(): snow_other_df = pd.DataFrame(other, columns=["B"]) native_df = native_pd.DataFrame( - data, columns=["A"], index=pd.Index([1, 2, 3], name="B") + data, columns=["A"], index=native_pd.Index([1, 2, 3], name="B") ) native_cond_df = native_pd.DataFrame(cond, columns=["A"]) native_other_df = native_pd.DataFrame(other, columns=["B"]) @@ -669,7 +669,7 @@ def test_dataframe_mask_with_duplicated_index_aligned(cond_frame, other): # index with duplicated value 2 index = pd.Index([2, 1, 2, 3], name="index") snow_df = pd.DataFrame({"A": data}, index=index) - native_df = native_pd.DataFrame({"A": data}, index=index) + native_df = native_pd.DataFrame({"A": data}, index=index.to_pandas()) native_cond = cond_frame native_cond.index = index @@ -714,7 +714,7 @@ def test_dataframe_mask_with_duplicated_index_unaligned(): # requires eager evaluation. 
expected_pandas = native_pd.DataFrame( {"A": [5, 6, 3, 3, 4, 5, 6, 5, 5, 7]}, - index=pd.Index([2, 2, 2, 2, 1, 2, 2, 2, 2, 3], name="index"), + index=native_pd.Index([2, 2, 2, 2, 1, 2, 2, 2, 2, 3], name="index"), ) assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( snow_res, expected_pandas diff --git a/tests/integ/modin/frame/test_name.py b/tests/integ/modin/frame/test_name.py index a769b7cf601..b23a3b26f0f 100644 --- a/tests/integ/modin/frame/test_name.py +++ b/tests/integ/modin/frame/test_name.py @@ -5,11 +5,11 @@ import modin.pandas as pd import pandas as native_pd import pytest -from pandas._testing import assert_index_equal import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( + assert_index_equal, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, ) @@ -43,4 +43,4 @@ def test_create_dataframe_from_object_with_name(sample): def test_create_dataframe_from_snowpark_pandas_series(): df = pd.DataFrame([[2, 3, 4], [5, 6, 7]], columns=["X", "Y", "Z"]) df = pd.DataFrame([df.X, df.iloc[:, 2]]) - assert_index_equal(df.index, pd.Index(["X", "Z"])) + assert_index_equal(df.index, native_pd.Index(["X", "Z"])) diff --git a/tests/integ/modin/frame/test_rename.py b/tests/integ/modin/frame/test_rename.py index c3b42b17861..2700268bba0 100644 --- a/tests/integ/modin/frame/test_rename.py +++ b/tests/integ/modin/frame/test_rename.py @@ -11,11 +11,14 @@ import pandas as native_pd import pytest from modin.pandas import DataFrame, Index, MultiIndex, Series -from pandas._testing import assert_index_equal import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker -from tests.integ.modin.utils import assert_frame_equal, eval_snowpark_pandas_result +from tests.integ.modin.utils import ( + assert_frame_equal, + assert_index_equal, + eval_snowpark_pandas_result, +) class TestRename: @@ -81,7 +84,7 @@ def test_rename(self, snow_float_frame): with SqlCounter(query_count=1, join_count=1): renamed = df.rename(index={"foo": "foo2", "bar": "bar2"}) - assert_index_equal(renamed.index, Index(["foo2", "bar2"])) + assert_index_equal(renamed.index, native_pd.Index(["foo2", "bar2"])) # have to pass something with SqlCounter(query_count=0): @@ -91,19 +94,23 @@ def test_rename(self, snow_float_frame): # partial columns with SqlCounter(query_count=0): renamed = snow_float_frame.rename(columns={"C": "foo", "D": "bar"}) - assert_index_equal(renamed.columns, Index(["A", "B", "foo", "bar"])) + assert_index_equal( + renamed.columns, native_pd.Index(["A", "B", "foo", "bar"]) + ) # other axis with SqlCounter(query_count=1, join_count=1): renamed = snow_float_frame.T.rename(index={"C": "foo", "D": "bar"}) - assert_index_equal(renamed.index, Index(["A", "B", "foo", "bar"])) + assert_index_equal(renamed.index, native_pd.Index(["A", "B", "foo", "bar"])) # index with name with SqlCounter(query_count=3, join_count=2): index = Index(["foo", "bar"], name="name") renamer = DataFrame(data, index=index) renamed = renamer.rename(index={"foo": "bar", "bar": "foo"}) - assert_index_equal(renamed.index, Index(["bar", "foo"], name="name")) + assert_index_equal( + renamed.index, native_pd.Index(["bar", "foo"], name="name") + ) assert renamed.index.name == renamer.index.name @sql_count_checker(query_count=0) diff --git a/tests/integ/modin/frame/test_sample.py b/tests/integ/modin/frame/test_sample.py index ffa5ee63cd2..821743eaef9 100644 --- 
a/tests/integ/modin/frame/test_sample.py +++ b/tests/integ/modin/frame/test_sample.py @@ -5,10 +5,10 @@ import modin.pandas as pd import numpy as np import pytest -from pandas._testing import assert_index_equal import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.utils import assert_index_equal @pytest.fixture(params=[True, False]) diff --git a/tests/integ/modin/frame/test_set_index.py b/tests/integ/modin/frame/test_set_index.py index 20d85009a13..42b2c14d9b7 100644 --- a/tests/integ/modin/frame/test_set_index.py +++ b/tests/integ/modin/frame/test_set_index.py @@ -7,6 +7,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -82,11 +83,13 @@ def test_set_index_multiindex_columns(snow_df): @sql_count_checker(query_count=2) def test_set_index_negative(snow_df, native_df): - index = native_pd.Index([1, 2]) + index = pd.Index([1, 2]) eval_snowpark_pandas_result( snow_df, native_df, - lambda df: df.set_index(index), + lambda df: df.set_index(try_convert_index_to_native(index)) + if isinstance(df, native_pd.DataFrame) + else df.set_index(index), expect_exception=True, expect_exception_match="Length mismatch: Expected 3 rows, received array of length 2", expect_exception_type=ValueError, @@ -112,10 +115,11 @@ def test_set_index_dup_column_name(): def test_set_index_names(snow_df): with SqlCounter(query_count=1): # Verify column names becomes index names. + # multi index, native pandas automatically used assert snow_df.set_index(["a", "b"]).index.names == ["a", "b"] # Verify name from input index is set. 
- index = native_pd.Index([1, 2, 0]) + index = pd.Index([1, 2, 0]) index.names = ["iname"] with SqlCounter(query_count=3, join_count=1): assert snow_df.set_index(index).index.names == ["iname"] @@ -202,7 +206,7 @@ def test_set_index_duplicate_label_in_keys(native_df, drop, append): "obj_type", [ pd.Series, - native_pd.Index, + pd.Index, np.array, list, lambda x: [list(x)], @@ -233,7 +237,7 @@ def test_set_index_pass_single_array(obj_type, drop, append, native_df): lambda df: df.set_index( key.to_pandas() if isinstance(df, native_pd.DataFrame) - and isinstance(key, pd.Series) + and isinstance(key, (pd.Series, pd.Index)) else key, drop=drop, append=append, @@ -245,7 +249,7 @@ def test_set_index_pass_single_array(obj_type, drop, append, native_df): "obj_type", [ pd.Series, - native_pd.Index, + pd.Index, np.array, list, lambda x: native_pd.MultiIndex.from_arrays([x]), @@ -256,7 +260,10 @@ def test_set_index_pass_arrays(obj_type, drop, append, native_df): array = ["one", "two", "three"] key = obj_type(array) keys = ["a", obj_type(array)] - native_keys = ["a", key.to_pandas() if isinstance(key, pd.Series) else key] + native_keys = [ + "a", + key.to_pandas() if isinstance(key, (pd.Series, pd.Index)) else key, + ] with SqlCounter(query_count=3, join_count=1): eval_snowpark_pandas_result( snow_df, @@ -273,7 +280,7 @@ def test_set_index_pass_arrays(obj_type, drop, append, native_df): "obj_type2", [ pd.Series, - native_pd.Index, + pd.Index, np.array, list, iter, @@ -284,7 +291,7 @@ def test_set_index_pass_arrays(obj_type, drop, append, native_df): "obj_type1", [ pd.Series, - native_pd.Index, + pd.Index, np.array, list, iter, @@ -298,8 +305,12 @@ def test_set_index_pass_arrays_duplicate(obj_type1, obj_type2, drop, append, nat keys = [obj_type1(array), obj_type2(array)] if obj_type1 == pd.Series: obj_type1 = native_pd.Series + elif obj_type1 == pd.Index: + obj_type1 = native_pd.Index if obj_type2 == pd.Series: obj_type2 = native_pd.Series + elif obj_type2 == pd.Index: + obj_type2 = native_pd.Index native_keys = [obj_type1(array), obj_type2(array)] eval_snowpark_pandas_result( snow_df, @@ -392,7 +403,7 @@ def test_set_index_raise_on_invalid_type_set_negative(keys, drop, append, native "obj_type", [ pd.Series, - native_pd.Index, + pd.Index, np.array, iter, lambda x: native_pd.MultiIndex.from_arrays([x]), @@ -406,6 +417,8 @@ def test_set_index_raise_on_len(length, obj_type, drop, append, native_df): key = obj_type(values) if obj_type == pd.Series: obj_type = native_pd.Series + elif obj_type == pd.Index: + obj_type = native_pd.Index native_key = obj_type(values) msg = "Length mismatch: Expected 3 rows, received array of length.*" diff --git a/tests/integ/modin/frame/test_setitem.py b/tests/integ/modin/frame/test_setitem.py index 2be7a24fbcc..a98a7eaef7d 100644 --- a/tests/integ/modin/frame/test_setitem.py +++ b/tests/integ/modin/frame/test_setitem.py @@ -332,8 +332,8 @@ def setitem_helper(df): native_pd.Series([]), ["a", "c", "b"], # replace with different type native_pd.Series(["x", "y", "z"], index=[2, 0, 1]), - pd.RangeIndex(3), - pd.Index( + native_pd.RangeIndex(3), + native_pd.Index( [datetime.datetime.now(), datetime.datetime.now(), datetime.datetime.now()] ), ], @@ -446,8 +446,8 @@ def test_df_setitem_with_unique_and_duplicate_index_values( snow_df1 = pd.DataFrame(data1, index=index) snow_df2 = pd.DataFrame(data2, index=other_index) - native_df1 = native_pd.DataFrame(data1, index=index) - native_df2 = native_pd.DataFrame(data2, index=other_index) + native_df1 = native_pd.DataFrame(data1, 
index=index.to_pandas()) + native_df2 = native_pd.DataFrame(data2, index=other_index.to_pandas()) def setitem_op(df): df["foo2"] = ( diff --git a/tests/integ/modin/frame/test_sort_values.py b/tests/integ/modin/frame/test_sort_values.py index 896f73cd71d..6809408dadc 100644 --- a/tests/integ/modin/frame/test_sort_values.py +++ b/tests/integ/modin/frame/test_sort_values.py @@ -22,7 +22,7 @@ def native_df_simple(): "a": ["abc", " ", "", "ABC", "_", "XYZ"], "b": ["1", "10", "xyz", "0", "2", "abc"], }, - index=pd.Index([1, 2, 3, 4, 5, 6], name="ind"), + index=native_pd.Index([1, 2, 3, 4, 5, 6], name="ind"), ) diff --git a/tests/integ/modin/frame/test_where.py b/tests/integ/modin/frame/test_where.py index 33ca2b75f3b..d3e9e5da60c 100644 --- a/tests/integ/modin/frame/test_where.py +++ b/tests/integ/modin/frame/test_where.py @@ -639,7 +639,7 @@ def test_dataframe_where_with_dataframe_cond_single_index_different_names_2(): snow_other_df = pd.DataFrame(other, columns=["B"]) native_df = native_pd.DataFrame( - data, columns=["A"], index=pd.Index([1, 2, 3], name="B") + data, columns=["A"], index=native_pd.Index([1, 2, 3], name="B") ) native_cond_df = native_pd.DataFrame(cond, columns=["A"]) native_other_df = native_pd.DataFrame(other, columns=["B"]) @@ -674,16 +674,17 @@ def test_dataframe_where_with_duplicated_index_aligned(cond_frame, other): data = [3, 4, 5, 2] # index with duplicated value 2 index = pd.Index([2, 1, 2, 3], name="index") + native_index = native_pd.Index([2, 1, 2, 3], name="index") snow_df = pd.DataFrame({"A": data}, index=index) - native_df = native_pd.DataFrame({"A": data}, index=index) + native_df = native_pd.DataFrame({"A": data}, index=native_index) native_cond = cond_frame - native_cond.index = index + native_cond.index = native_index snow_cond = pd.DataFrame(native_cond) if isinstance(other, native_pd.DataFrame): native_other = other - native_other.index = index + native_other.index = native_index snow_other = pd.DataFrame(native_other) else: native_other = other @@ -720,7 +721,7 @@ def test_dataframe_where_with_duplicated_index_unaligned(): # requires eager evaluation. 
expected_pandas = native_pd.DataFrame( {"A": [3, 3, 5, 6, 4, 5, 5, 5, 6, 2]}, - index=pd.Index([2, 2, 2, 2, 1, 2, 2, 2, 2, 3], name="index"), + index=native_pd.Index([2, 2, 2, 2, 1, 2, 2, 2, 2, 3], name="index"), ) assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( snow_res, expected_pandas diff --git a/tests/integ/modin/groupby/test_groupby_apply.py b/tests/integ/modin/groupby/test_groupby_apply.py index e16f4771e59..9f03f75fb7c 100644 --- a/tests/integ/modin/groupby/test_groupby_apply.py +++ b/tests/integ/modin/groupby/test_groupby_apply.py @@ -16,6 +16,7 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equal_to_pandas, @@ -848,7 +849,7 @@ def operation(df): assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( operation(snow_df), native_pd.Series( - [None, None], index=pd.Index(["i0", "i1"], name="level_0") + [None, None], index=native_pd.Index(["i0", "i1"], name="level_0") ), ) else: @@ -967,7 +968,7 @@ def test_args_and_kwargs(self, grouping_dfs_with_multiindexes): ) @pytest.mark.parametrize("index", [[2.0, np.nan, 2.0, 1.0], [np.nan] * 4]) def test_dropna(self, dropna, index): - pandas_index = pd.Index(index, name="index") + pandas_index = native_pd.Index(index, name="index") if dropna and pandas_index.isna().all(): pytest.xfail( reason="We drop all the rows, apply the UDTF, and try to " @@ -1123,7 +1124,10 @@ def test_grouping_series_by_external_by(self, func, dropna, group_keys, sort): eval_snowpark_pandas_result( *create_test_series([0, 1, 2], index=["a", "a", "b"]), lambda s: s.groupby( - s.index, dropna=dropna, group_keys=group_keys, sort=sort + try_convert_index_to_native(s.index), + dropna=dropna, + group_keys=group_keys, + sort=sort, ).apply(func), ) diff --git a/tests/integ/modin/groupby/test_groupby_basic_agg.py b/tests/integ/modin/groupby/test_groupby_basic_agg.py index 4c3e83c7c46..915d098cb90 100644 --- a/tests/integ/modin/groupby/test_groupby_basic_agg.py +++ b/tests/integ/modin/groupby/test_groupby_basic_agg.py @@ -414,7 +414,7 @@ def test_string_sum_with_all_nulls_in_group_produces_empty_string(): assert_snowpark_pandas_equal_to_pandas( snow_result, native_pd.DataFrame( - {"string_col": ["", "a"]}, index=pd.Index([0, 1], name="key_col") + {"string_col": ["", "a"]}, index=native_pd.Index([0, 1], name="key_col") ), ) diff --git a/tests/integ/modin/groupby/test_groupby_property.py b/tests/integ/modin/groupby/test_groupby_property.py index db3e714f4a6..0b6b73025ac 100644 --- a/tests/integ/modin/groupby/test_groupby_property.py +++ b/tests/integ/modin/groupby/test_groupby_property.py @@ -269,22 +269,40 @@ def test_groups_grouping_by_single_index_column_with_sort_false( PrettyDict( { 3.1: pd.MultiIndex.from_arrays( - [pd.Index([3.1], name="col3"), pd.Index([17.0], name="col4")] + [ + native_pd.Index([3.1], name="col3"), + native_pd.Index([17.0], name="col4"), + ] ), 8.0: pd.MultiIndex.from_arrays( - [pd.Index([8.0], name="col3"), pd.Index([3.0], name="col4")] + [ + native_pd.Index([8.0], name="col3"), + native_pd.Index([3.0], name="col4"), + ] ), 12.0: pd.MultiIndex.from_arrays( - [pd.Index([12.0], name="col3"), pd.Index([16.0], name="col4")] + [ + native_pd.Index([12.0], name="col3"), + native_pd.Index([16.0], name="col4"), + ] ), 10.0: pd.MultiIndex.from_arrays( - [pd.Index([10.0], 
name="col3"), pd.Index([15.0], name="col4")] + [ + native_pd.Index([10.0], name="col3"), + native_pd.Index([15.0], name="col4"), + ] ), 4.0: pd.MultiIndex.from_arrays( - [pd.Index([4.0], name="col3"), pd.Index([np.nan], name="col4")] + [ + native_pd.Index([4.0], name="col3"), + native_pd.Index([np.nan], name="col4"), + ] ), np.nan: pd.MultiIndex.from_arrays( - [pd.Index([np.nan], name="col3"), pd.Index([np.nan], name="col4")] + [ + native_pd.Index([np.nan], name="col3"), + native_pd.Index([np.nan], name="col4"), + ] ), } ), diff --git a/tests/integ/modin/groupby/test_groupby_series.py b/tests/integ/modin/groupby/test_groupby_series.py index b41fdbaab36..7756f8b620a 100644 --- a/tests/integ/modin/groupby/test_groupby_series.py +++ b/tests/integ/modin/groupby/test_groupby_series.py @@ -38,7 +38,7 @@ def test_groupby_sort_false_multiindex_series(series_multi_numeric): # TODO (SNOW-890686): merge test_groupby_sort_false_multiindex_series and test_groupby_sort_multiindex_series # once Snowpark pandas is updated to align with pandas 2.0.x result = series_multi_numeric.groupby("b", sort=False).max() - expected = native_pd.Series([1, 5], index=pd.Index([2, 1], name="b")) + expected = native_pd.Series([1, 5], index=native_pd.Index([2, 1], name="b")) assert_snowpark_pandas_equal_to_pandas(result, expected, check_dtype=False) eval_snowpark_pandas_result( diff --git a/tests/integ/modin/io/test_read_snowflake.py b/tests/integ/modin/io/test_read_snowflake.py index 1ffc7d4a318..79d98dd71ef 100644 --- a/tests/integ/modin/io/test_read_snowflake.py +++ b/tests/integ/modin/io/test_read_snowflake.py @@ -8,7 +8,6 @@ import numpy as np import pandas as native_pd import pytest -from pandas.testing import assert_index_equal import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark._internal.analyzer.analyzer_utils import quote_name @@ -28,6 +27,7 @@ SEMI_STRUCTURED_TYPE_DATA, VALID_SNOWFLAKE_COLUMN_NAMES, assert_frame_equal, + assert_index_equal, assert_series_equal, create_table_with_type, ) diff --git a/tests/integ/modin/io/test_read_snowflake_select_query.py b/tests/integ/modin/io/test_read_snowflake_select_query.py index 29d9287cd31..a8f9ad9ebd1 100644 --- a/tests/integ/modin/io/test_read_snowflake_select_query.py +++ b/tests/integ/modin/io/test_read_snowflake_select_query.py @@ -244,7 +244,7 @@ def test_read_snowflake_query_with_index_col_and_columns_overlap(session): ) pdf = df.to_pandas() assert pdf.index.dtype == np.int64 - assert pdf.columns.equals(pd.Index(["col0", "index_col"])) + assert pdf.columns.equals(native_pd.Index(["col0", "index_col"])) assert pdf.index.name == "index_col" diff --git a/tests/integ/modin/pivot/test_pivot_dropna.py b/tests/integ/modin/pivot/test_pivot_dropna.py index 4da2bd1821f..46ac16dc575 100644 --- a/tests/integ/modin/pivot/test_pivot_dropna.py +++ b/tests/integ/modin/pivot/test_pivot_dropna.py @@ -150,7 +150,7 @@ def test_pivot_table_single_nuance_aggfuncs_dropna_and_null_data( ("mean", "E", "up"): [None, 1.0], } ) - native_df.index = pd.Index(["bar", "foo"], dtype="object", name="A") + native_df.index = native_pd.Index(["bar", "foo"], dtype="object", name="A") native_df.columns.names = [None, None, "C"] eval_snowpark_pandas_result( diff --git a/tests/integ/modin/pivot/test_pivot_fill_value.py b/tests/integ/modin/pivot/test_pivot_fill_value.py index 15ab645090e..71a4c1a9a39 100644 --- a/tests/integ/modin/pivot/test_pivot_fill_value.py +++ b/tests/integ/modin/pivot/test_pivot_fill_value.py @@ -167,7 +167,7 @@ def 
test_pivot_table_single_nuance_aggfuncs_fill_value_and_null_data( [TEST_AGGFUNC, TEST_VALUES, TEST_COLUMN_VALUES], names=[None, None] + TEST_COLUMNS, ), - index=pd.Index(["bar", "foo"], name=TEST_INDEX), + index=native_pd.Index(["bar", "foo"], name=TEST_INDEX), ) assert_frame_equal( diff --git a/tests/integ/modin/series/test_getattr.py b/tests/integ/modin/series/test_getattr.py index f97a734a819..66415711ad1 100644 --- a/tests/integ/modin/series/test_getattr.py +++ b/tests/integ/modin/series/test_getattr.py @@ -7,11 +7,10 @@ import modin.pandas as pd import pandas as native_pd import pytest -from pandas._testing import assert_index_equal import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.sql_counter import SqlCounter -from tests.integ.modin.utils import assert_series_equal +from tests.integ.modin.utils import assert_index_equal, assert_series_equal @pytest.mark.parametrize( diff --git a/tests/integ/modin/series/test_getitem.py b/tests/integ/modin/series/test_getitem.py index 8d2564571b2..b71b7c0dbf6 100644 --- a/tests/integ/modin/series/test_getitem.py +++ b/tests/integ/modin/series/test_getitem.py @@ -9,6 +9,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -20,10 +21,10 @@ [True] * 7, [False] * 7, np.array([True, True, False, False, False, True, True], dtype=bool), - pd.Index([True, True, False, False, False, True, True]), + native_pd.Index([True, True, False, False, False, True, True]), [True], [True, True, False, False, False, True, True, True], - pd.Index([], dtype=bool), + native_pd.Index([], dtype=bool), np.array([], dtype=bool), ], ) @@ -31,12 +32,15 @@ def test_series_getitem_with_boolean_list_like( key, default_index_snowpark_pandas_series, default_index_native_series ): + if isinstance(key, native_pd.Index): + key = pd.Index(key) + def getitem_helper(ser): # Native pandas can only handle boolean list-likes objects of length = num(rows). if isinstance(ser, native_pd.Series): # If native pandas Series, truncate the series and key. 
_ser = ser[: len(key)] - _key = key[: len(_ser)] + _key = try_convert_index_to_native(key)[: len(_ser)] else: _key, _ser = key, ser return _ser[_key] @@ -141,7 +145,7 @@ def test_series_getitem_with_empty_keys( eval_snowpark_pandas_result( default_index_snowpark_pandas_series, default_index_native_series, - lambda ser: ser[key], + lambda ser: ser[try_convert_index_to_native(key)], ) diff --git a/tests/integ/modin/series/test_iloc.py b/tests/integ/modin/series/test_iloc.py index 0e7c572a2fa..245c8aa776f 100644 --- a/tests/integ/modin/series/test_iloc.py +++ b/tests/integ/modin/series/test_iloc.py @@ -14,6 +14,7 @@ from pandas.errors import IndexingError import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from tests.integ.modin.frame.test_iloc import snowpark_pandas_input_keys from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( @@ -59,7 +60,9 @@ def test_series_iloc_snowpark_pandas_input_return_dataframe( eval_snowpark_pandas_result( default_index_snowpark_pandas_series, default_index_native_series, - lambda ser: ser.iloc[iloc_snowpark_pandas_input_map[key]], + lambda ser: ser.iloc[ + try_convert_index_to_native(iloc_snowpark_pandas_input_map[key]) + ], ) @@ -174,11 +177,19 @@ def iloc_helper(ser): # Convert key to the required type. if key_type == "index": - _key = pd.Index(_key, dtype=bool) + _key = ( + pd.Index(_key, dtype=bool) + if isinstance(_ser, pd.Series) + else native_pd.Index(_key, dtype=bool) + ) elif key_type == "ndarray": _key = np.array(_key) elif key_type == "index with name": - _key = pd.Index(_key, name="some name", dtype=bool) + _key = ( + pd.Index(_key, name="some name", dtype=bool) + if isinstance(_ser, pd.Series) + else native_pd.Index(_key, name="some name", dtype=bool) + ) elif key_type == "series" and isinstance(_ser, pd.Series): # Native pandas does not support iloc with Snowpark Series. _key = pd.Series(_key, dtype=bool) @@ -251,11 +262,17 @@ def iloc_helper(ser): # Convert key to the required type. if key_type == "index": - _key = pd.Index(_key) + _key = ( + pd.Index(_key) if isinstance(ser, pd.Series) else native_pd.Index(_key) + ) elif key_type == "ndarray": _key = np.array(_key) elif key_type == "index with name": - _key = pd.Index(_key, name="some name") + _key = ( + pd.Index(_key, name="some name") + if isinstance(ser, pd.Series) + else native_pd.Index(_key, name="some name") + ) elif key_type == "series" and isinstance(ser, pd.Series): # Native pandas does not support iloc with Snowpark Series. _key = pd.Series(_key, dtype=float if len(key) == 0 else None) @@ -443,8 +460,8 @@ def test_series_iloc_get_invalid_slice_key_negative( np.nan, np.array(["this", "is", "an", "ndarray!"]), native_pd.Index(["index", "of", "strings"]), - pd.Index([]), - pd.Index([], dtype=str), + native_pd.Index([]), + native_pd.Index([], dtype=str), "string", "test", ["list", "of", "strings"], @@ -456,6 +473,8 @@ def test_series_iloc_get_non_numeric_key_negative(key, default_index_native_int_ # Check whether invalid non-numeric keys passed in raise TypeError. # list-like objects need to be numeric, scalar keys can only be integers. # Native pandas Series and DataFrames are invalid inputs. 
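+    # The parametrized key may be a native pandas Index; convert it to a
+    # Snowpark pandas Index here so iloc is exercised with the Snowpark type.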
+ if isinstance(key, native_pd.Index): + key = pd.Index(key) snowpark_index_int_series = pd.Series(default_index_native_int_series) error_msg = re.escape(f".iloc requires numeric indexers, got {key}") with pytest.raises(IndexError, match=error_msg): diff --git a/tests/integ/modin/series/test_isin.py b/tests/integ/modin/series/test_isin.py index 5b1f30a3480..a4d3ba90278 100644 --- a/tests/integ/modin/series/test_isin.py +++ b/tests/integ/modin/series/test_isin.py @@ -9,6 +9,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, @@ -64,10 +65,12 @@ def _test_isin_with_snowflake_logic(s, values): # (native_pd.Series(index=["A", "B"]), 1), # not supported anymore because of index type mismatch (native_pd.Series([None, -10]), 5), (native_pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), 5), - (pd.Index([4, 5, 6]), 3), + (native_pd.Index([4, 5, 6]), 3), ], ) def test_isin_integer_data(values, expected_query_count): + if isinstance(values, native_pd.Index): + values = pd.Index(values) data = [3, 4, 2, 1, None, 0, 5, 4, 2, -10, -20, -42, None] with SqlCounter(query_count=expected_query_count): snow_series = pd.Series(data) @@ -76,7 +79,9 @@ def test_isin_integer_data(values, expected_query_count): eval_snowpark_pandas_result( snow_series, native_series, - lambda s: _test_isin_with_snowflake_logic(s, values), + lambda s: _test_isin_with_snowflake_logic( + s, try_convert_index_to_native(values) + ), ) diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index 6c994b8025a..b45515bf266 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -15,6 +15,7 @@ from pandas.errors import IndexingError import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from tests.integ.modin.frame.test_loc import ( diff2native_negative_row_inputs, negative_snowpark_pandas_input_keys, @@ -31,7 +32,7 @@ EMPTY_LIST_LIKE_VALUES = [ [], - pd.Index([]), + native_pd.Index([]), np.array([]), native_pd.Series([]), ] @@ -298,7 +299,11 @@ def loc_helper(ser): # Convert key to the required type. 
if key_type == "index": - _key = pd.Index(_key, dtype=bool) + _key = ( + pd.Index(_key, dtype=bool) + if isinstance(_ser, pd.Series) + else native_pd.Index(_key, dtype=bool) + ) elif key_type == "ndarray": _key = np.array(_key, dtype=bool) elif key_type == "series": @@ -457,7 +462,11 @@ def type_convert(key, is_snow_type): elif key_type == "index": # native pandas has a bug to overwrite loc result's index name to the key's index name # so for testing, we overwrite the index name to be the same as the index name in the main frame - return pd.Index(key.to_list(), name=index_name) + return ( + pd.Index(key.to_list(), name=index_name) + if is_snow_type + else native_pd.Index(key.to_list(), name=index_name) + ) return s.loc[type_convert(native_series_key, isinstance(s, pd.Series))] @@ -755,7 +764,9 @@ def loc_set_helper(s): else _row_key ) _item = pd.Series(_item) if isinstance(_item, native_pd.Series) else _item - + else: + _row_key = try_convert_index_to_native(_row_key) + _item = try_convert_index_to_native(_item) s.loc[_row_key] = _item with SqlCounter(query_count=1, join_count=expected_join_count): @@ -786,7 +797,11 @@ def key_converter(s): _row_key = row_key # Convert key to the required type. if key_type == "index": - _row_key = pd.Index(_row_key) + _row_key = ( + pd.Index(_row_key) + if isinstance(s, pd.Series) + else native_pd.Index(_row_key) + ) elif key_type == "ndarray": _row_key = np.array(_row_key) elif key_type == "series": @@ -800,7 +815,7 @@ def key_converter(s): def loc_set_helper(s): row_key = key_converter(s) if isinstance(s, native_pd.Series): - s.loc[row_key] = item + s.loc[try_convert_index_to_native(row_key)] = item else: s.loc[pd.Series(row_key)] = pd.DataFrame(item) @@ -1016,7 +1031,9 @@ def test_series_loc_set_with_empty_key_and_empty_item_negative( err_msg = "The length of the value/item to set is empty" with pytest.raises(ValueError, match=err_msg): - native_ser.loc[key] = item + native_ser.loc[try_convert_index_to_native(key)] = try_convert_index_to_native( + item + ) snowpark_ser.loc[ pd.Series(key) if isinstance(key, native_pd.Series) else key ] = item @@ -1047,7 +1064,7 @@ def test_series_loc_set_with_empty_key_and_empty_series_item( snowpark_ser = pd.Series(native_ser) item = native_pd.Series([]) - native_ser.loc[key] = item + native_ser.loc[try_convert_index_to_native(key)] = item snowpark_ser.loc[ pd.Series(key) if isinstance(key, native_pd.Series) else key ] = pd.Series(item) @@ -1088,7 +1105,7 @@ def test_series_loc_set_with_empty_key_and_scalar_item( snowpark_ser = pd.Series(native_ser) item = 32 - native_ser.loc[key] = item + native_ser.loc[try_convert_index_to_native(key)] = item snowpark_ser.loc[ pd.Series(key) if isinstance(key, native_pd.Series) else key ] = item @@ -1171,7 +1188,7 @@ def test_series_loc_set_with_empty_key_and_series_item( native_ser = default_index_native_series.copy() snowpark_ser = pd.Series(native_ser) - native_ser.loc[key] = item + native_ser.loc[try_convert_index_to_native(key)] = item snowpark_ser.loc[pd.Series(key) if isinstance(key, native_pd.Series) else key] = ( pd.Series(item) if isinstance(item, native_pd.Series) else item ) diff --git a/tests/integ/modin/series/test_mask.py b/tests/integ/modin/series/test_mask.py index 0a7a9162662..165bca79061 100644 --- a/tests/integ/modin/series/test_mask.py +++ b/tests/integ/modin/series/test_mask.py @@ -120,11 +120,14 @@ def test_series_mask_with_series_cond_single_index_different_names(): cond = [False, True, False] snow_ser = pd.Series(data, index=pd.Index(["a", "b", "c"], 
name="Y")) - native_ser = native_pd.Series(data, index=pd.Index(["a", "b", "c"], name="Y")) + native_ser = native_pd.Series( + data, index=native_pd.Index(["a", "b", "c"], name="Y") + ) cond_snow_ser = pd.Series(cond, index=pd.Index(["a", "b", "c"], name="X")) - cond_native_ser = native_pd.Series(cond, index=pd.Index(["a", "b", "c"], name="X")) - + cond_native_ser = native_pd.Series( + cond, index=native_pd.Index(["a", "b", "c"], name="X") + ) eval_snowpark_pandas_result( snow_ser, native_ser, @@ -140,12 +143,13 @@ def test_series_mask_with_duplicated_index_aligned(): data = [1, 2, 3] cond = [False, True, False] index = pd.Index(["a", "a", "c"], name="index") + native_index = native_pd.Index(["a", "a", "c"], name="index") snow_ser = pd.Series(data, index=index) - native_ser = native_pd.Series(data, index=index) + native_ser = native_pd.Series(data, index=native_index) cond_snow_ser = pd.Series(cond, index=index) - cond_native_ser = native_pd.Series(cond, index=index) + cond_native_ser = native_pd.Series(cond, index=native_index) eval_snowpark_pandas_result( snow_ser, diff --git a/tests/integ/modin/series/test_rename.py b/tests/integ/modin/series/test_rename.py index 7766b5f8800..e0850cf891c 100644 --- a/tests/integ/modin/series/test_rename.py +++ b/tests/integ/modin/series/test_rename.py @@ -8,15 +8,15 @@ import modin.pandas as pd import numpy as np +import pandas as native_pd import pandas._testing as tm import pytest from modin.pandas import Index, MultiIndex, Series -from pandas._testing import assert_index_equal import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.conftest import running_on_public_ci from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker -from tests.integ.modin.utils import assert_series_equal +from tests.integ.modin.utils import assert_index_equal, assert_series_equal class TestRename: @@ -50,13 +50,15 @@ def test_rename_partial_dict(self): # partial dict ser = Series(np.arange(4), index=["a", "b", "c", "d"], dtype="int64") renamed = ser.rename({"b": "foo", "d": "bar"}) - assert_index_equal(renamed.index, Index(["a", "foo", "c", "bar"])) + assert_index_equal(renamed.index, native_pd.Index(["a", "foo", "c", "bar"])) @sql_count_checker(query_count=2, join_count=1) def test_rename_retain_index_name(self): # index with name renamer = Series( - np.arange(4), index=Index(["a", "b", "c", "d"], name="name"), dtype="int64" + np.arange(4), + index=pd.Index(["a", "b", "c", "d"], name="name"), + dtype="int64", ) renamed = renamer.rename({}) assert renamed.index.name == renamer.index.name diff --git a/tests/integ/modin/series/test_sample.py b/tests/integ/modin/series/test_sample.py index 39ae6d460d8..383d5ca23c1 100644 --- a/tests/integ/modin/series/test_sample.py +++ b/tests/integ/modin/series/test_sample.py @@ -5,11 +5,11 @@ import modin.pandas as pd import pandas as native_pd import pytest -from pandas._testing import assert_index_equal import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( + assert_index_equal, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, ) diff --git a/tests/integ/modin/series/test_setitem.py b/tests/integ/modin/series/test_setitem.py index 8941c2b8a18..2647ba57447 100644 --- a/tests/integ/modin/series/test_setitem.py +++ b/tests/integ/modin/series/test_setitem.py @@ -13,6 +13,7 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException +from 
snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_series_equal, @@ -997,7 +998,7 @@ def test_series_setitem_array_like_key_and_scalar_item_mixed_types( snowpark_ser = pd.Series(native_ser) # Assign item. - native_ser[key] = item + native_ser[try_convert_index_to_native(key)] = item snowpark_ser[key] = item err_msg = "Series are different" @@ -1550,7 +1551,7 @@ def test_series_setitem_with_empty_key_and_empty_item_negative( err_msg = "The length of the value/item to set is empty" with pytest.raises(ValueError, match=err_msg): - native_ser[key] = item + native_ser[try_convert_index_to_native(key)] = item snowpark_ser[ pd.Series(key) if isinstance(key, native_pd.Series) else key ] = item @@ -1581,7 +1582,7 @@ def test_series_setitem_with_empty_key_and_empty_series_item( snowpark_ser = pd.Series(native_ser) item = native_pd.Series([]) - native_ser[key] = item + native_ser[try_convert_index_to_native(key)] = item snowpark_ser[ pd.Series(key) if isinstance(key, native_pd.Series) else key ] = pd.Series(item) @@ -1622,7 +1623,7 @@ def test_series_setitem_with_empty_key_and_scalar_item( snowpark_ser = pd.Series(native_ser) item = 32 - native_ser[key] = item + native_ser[try_convert_index_to_native(key)] = item snowpark_ser[pd.Series(key) if isinstance(key, native_pd.Series) else key] = item assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snowpark_ser, native_ser) @@ -1657,7 +1658,7 @@ def test_series_setitem_with_empty_key_and_series_and_list_like_item_negative( "cannot set using a list-like indexer with a different length than the value" ) with pytest.raises(ValueError, match=err_msg): - native_ser[key] = item + native_ser[try_convert_index_to_native(key)] = item snowpark_ser[pd.Series(key) if isinstance(key, native_pd.Series) else key] = ( pd.Series(item) if isinstance(item, native_pd.Series) else item ) diff --git a/tests/integ/modin/series/test_where.py b/tests/integ/modin/series/test_where.py index c6de64a34f4..cc08ca9d09d 100644 --- a/tests/integ/modin/series/test_where.py +++ b/tests/integ/modin/series/test_where.py @@ -120,10 +120,14 @@ def test_series_where_with_series_cond_single_index_different_names(): cond = [False, True, False] snow_ser = pd.Series(data, index=pd.Index(["a", "b", "c"], name="Y")) - native_ser = native_pd.Series(data, index=pd.Index(["a", "b", "c"], name="Y")) + native_ser = native_pd.Series( + data, index=native_pd.Index(["a", "b", "c"], name="Y") + ) cond_snow_ser = pd.Series(cond, index=pd.Index(["a", "b", "c"], name="X")) - cond_native_ser = native_pd.Series(cond, index=pd.Index(["a", "b", "c"], name="X")) + cond_native_ser = native_pd.Series( + cond, index=native_pd.Index(["a", "b", "c"], name="X") + ) eval_snowpark_pandas_result( snow_ser, @@ -140,12 +144,13 @@ def test_series_where_with_duplicated_index_aligned(): data = [1, 2, 3] cond = [False, True, False] index = pd.Index(["a", "a", "c"], name="index") + native_index = native_pd.Index(["a", "a", "c"], name="index") snow_ser = pd.Series(data, index=index) - native_ser = native_pd.Series(data, index=index) + native_ser = native_pd.Series(data, index=native_index) cond_snow_ser = pd.Series(cond, index=index) - cond_native_ser = native_pd.Series(cond, index=index) + cond_native_ser = native_pd.Series(cond, index=native_index) eval_snowpark_pandas_result( snow_ser, diff --git a/tests/integ/modin/test_concat.py b/tests/integ/modin/test_concat.py index 
f61d5c34f34..361ba19b05c 100644 --- a/tests/integ/modin/test_concat.py +++ b/tests/integ/modin/test_concat.py @@ -9,13 +9,13 @@ import pandas as native_pd import pytest from pandas import Index, MultiIndex -from pandas.testing import assert_index_equal import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_frame_equal, + assert_index_equal, assert_series_equal, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, eval_snowpark_pandas_result, diff --git a/tests/integ/modin/test_from_pandas_to_pandas.py b/tests/integ/modin/test_from_pandas_to_pandas.py index 5813778b8ac..c8d32e6805c 100644 --- a/tests/integ/modin/test_from_pandas_to_pandas.py +++ b/tests/integ/modin/test_from_pandas_to_pandas.py @@ -10,7 +10,6 @@ import pandas as native_pd import pytest from pandas import DatetimeTZDtype -from pandas._testing import assert_index_equal from pandas.core.dtypes.common import is_datetime64_any_dtype import snowflake.snowpark @@ -26,6 +25,7 @@ BASIC_TYPE_DATA2, VALID_PANDAS_LABELS, assert_frame_equal, + assert_index_equal, assert_series_equal, assert_snowpark_pandas_equal_to_pandas, ) @@ -110,7 +110,7 @@ def check_result_from_and_to_pandas( Raises: AssertionError if the converted dataframe does not match with the original one """ - if columns is not None and not isinstance(columns, (list, pd.Index)): + if columns is not None and not isinstance(columns, (list, native_pd.Index)): columns = [columns] native_df = native_pd.DataFrame(data=data, index=index, columns=columns) snow_df = pd.DataFrame(native_df) @@ -235,7 +235,7 @@ def test_index_name(index_name): @sql_count_checker(query_count=1) def test_column_index_names(pandas_label): snow_df = pd.DataFrame({pandas_label: [1, 2]}) - expected_columns_index = pd.Index([pandas_label]) + expected_columns_index = native_pd.Index([pandas_label]) # verify columns is same as original dataframe. assert_index_equal(snow_df.columns, expected_columns_index) # convert back to native pandas and verify columns is same as the original dataframe @@ -246,9 +246,7 @@ def test_column_index_names(pandas_label): @pytest.mark.parametrize("name", [None, *VALID_PANDAS_LABELS]) @sql_count_checker(query_count=1) def test_to_pandas_column_index_names(name): - df = pd.DataFrame( - data=[[1] * 2, [2] * 2], columns=native_pd.Index([1, 2], name=name) - ) + df = pd.DataFrame(data=[[1] * 2, [2] * 2], columns=pd.Index([1, 2], name=name)) assert df.columns.names == [name] pdf = df.to_pandas() assert pdf.columns.names == [name] diff --git a/tests/integ/modin/test_utils.py b/tests/integ/modin/test_utils.py index 488fde8ffa1..d52eb0a6807 100644 --- a/tests/integ/modin/test_utils.py +++ b/tests/integ/modin/test_utils.py @@ -1,17 +1,25 @@ # # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. 
# +import numpy as np +import pandas as native_pd import pytest +import snowflake.snowpark.modin.pandas as pd from snowflake.snowpark._internal.analyzer.analyzer_utils import quote_name from snowflake.snowpark._internal.utils import ( TempObjectType, random_name_for_temp_object, ) +from snowflake.snowpark.modin.pandas.utils import ( + ensure_index, + try_convert_index_to_native, +) from snowflake.snowpark.modin.plugin._internal.utils import ( create_ordered_dataframe_with_readonly_temp_table, ) -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.utils import assert_index_equal @pytest.mark.parametrize("columns", [["A", "b", "C"], ['"a"', '"B"', '"c"']]) @@ -35,3 +43,100 @@ def test_create_snowpark_dataframe_with_readonly_temp_table(session, columns): assert [ row[0] for row in ordered_df.select(row_position_quoted_identifier).collect() ] == list(range(num_rows)) + + +INDEX_DATA = [ + [1, 2, 3], + ["a", "b", "c"], + [1, "a", -5, "abc"], + np.array([1, 2, 3]), + [[1, 2, 3], ["a", "b", "c"]], +] + + +@pytest.mark.parametrize("data", INDEX_DATA) +@sql_count_checker(query_count=0) +def test_assert_index_equal(data): + if isinstance(data[0], list): + a = pd.MultiIndex.from_arrays(data) + b = pd.MultiIndex.from_arrays(data) + c = pd.MultiIndex.from_arrays([[-1, 2, 3], ["a", "b", "c"]]) + else: + a = pd.Index(data) + b = pd.Index(data) + c = pd.Index([-1, 2, 3]) + assert_index_equal(a, b) + with pytest.raises(AssertionError): + assert_index_equal(a, c) + with pytest.raises(AssertionError): + assert_index_equal(b, c) + + +@pytest.mark.parametrize("data", INDEX_DATA) +@sql_count_checker(query_count=0) +def test_try_convert_to_native_index(data): + + # we only convert a snowpark pandas index to a native pandas index + data = try_convert_index_to_native(data) + assert isinstance(data, (list, np.ndarray)) + + if isinstance(data[0], list): + index = pd.MultiIndex.from_arrays(data) + index2 = pd.MultiIndex.from_arrays(data) + else: + index = pd.Index(data) + index2 = pd.Index(data) + index = try_convert_index_to_native(index) + assert isinstance(index, native_pd.Index) + assert_index_equal(index, index2) + + +def ensure_index_test_helper(data, qc): + with SqlCounter(query_count=qc): + new_data = ensure_index(data) + # if the given data is a list of lists, ensure_index should be converting to a multiindex + if isinstance(data[0], list): + assert isinstance(new_data, pd.MultiIndex) + assert_index_equal(new_data, pd.MultiIndex.from_arrays(data)) + # if the given data is a list of tuples, ensure_index should be converting to a multiindex + # this case would also apply if the given data was a multiindex + elif isinstance(data[0], tuple): + assert isinstance(new_data, pd.MultiIndex) + assert_index_equal( + new_data, pd.MultiIndex.from_arrays(data.values.to_numpy()) + ) + # otherwise, ensure_index should convert to a pd.Index + else: + assert isinstance(new_data, pd.Index) + assert_index_equal(new_data, native_pd.Index(data)) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3], + ["a", "b", "c"], + [1, "a", -5, "abc"], + np.array([1, 2, 3]), + native_pd.Index([1, 2, 3]), + native_pd.Index(["a", "b", "c"]), + native_pd.Series([1, 2, 3]), + [[1, 2, 3], ["a", "b", "c"]], + ], +) +def test_ensure_index(data): + qc = 0 + if isinstance(data, native_pd.MultiIndex): + data = data + elif isinstance(data, native_pd.Index): + # test on native_pd.Index + ensure_index_test_helper(data, 0) + # convert to pd.Index to 
test on pd.Index later + data = pd.Index(data) + elif isinstance(data, native_pd.Series): + # test on native_pd.Series + ensure_index_test_helper(data, 0) + # convert to pd.Series to test on pd.Series later + data = pd.Series(data) + qc = 6 + ensure_index_test_helper(data, qc) diff --git a/tests/integ/modin/utils.py b/tests/integ/modin/utils.py index 3c832ade15b..f3bd5948dda 100644 --- a/tests/integ/modin/utils.py +++ b/tests/integ/modin/utils.py @@ -1,6 +1,8 @@ # # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. # +from __future__ import annotations + import datetime import json from collections import namedtuple @@ -14,13 +16,13 @@ import pytest from modin.pandas import DataFrame, Series from pandas import isna -from pandas._testing import assert_index_equal from pandas._typing import Scalar from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.inference import is_scalar import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.dataframe import DataFrame as SnowparkDataFrame +from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from snowflake.snowpark.modin.utils import SupportsPublicToPandas from snowflake.snowpark.session import Session from snowflake.snowpark.types import StructField, StructType @@ -180,7 +182,14 @@ def create_test_dfs(*args, **kwargs) -> tuple[pd.DataFrame, native_pd.DataFrame] the second element is a native pandas dataframe created by forwarding the arguments to the pandas dataframe constructor. """ - return (pd.DataFrame(*args, **kwargs), native_pd.DataFrame(*args, **kwargs)) + native_kw_args = kwargs.copy() + if "index" in native_kw_args: + native_kw_args["index"] = try_convert_index_to_native(native_kw_args["index"]) + if "columns" in native_kw_args: + native_kw_args["columns"] = try_convert_index_to_native( + native_kw_args["columns"] + ) + return (pd.DataFrame(*args, **kwargs), native_pd.DataFrame(*args, **native_kw_args)) def create_test_series(*args, **kwargs) -> tuple[pd.Series, native_pd.Series]: @@ -212,10 +221,10 @@ def try_to_load_json_string(value: Any) -> Any: def assert_snowpark_pandas_equal_to_pandas( - snow: Union[DataFrame, Series], - expected_pandas: Union[native_pd.DataFrame, native_pd.Series], + snow: DataFrame | Series, + expected_pandas: native_pd.DataFrame | native_pd.Series, *, - statement_params: Optional[dict[str, str]] = None, + statement_params: dict[str, str] | None = None, expected_index_type: str = None, expected_dtypes: list[str] = None, **kwargs: Any, @@ -265,8 +274,8 @@ def assert_snowpark_pandas_equal_to_pandas( def assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( - snow: Union[DataFrame, Series], - native: Union[native_pd.DataFrame, native_pd.Series], + snow: DataFrame | Series, + native: native_pd.DataFrame | native_pd.Series, **kwargs, ) -> None: """ @@ -276,8 +285,8 @@ def assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( def assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64( - snow: Union[DataFrame, Series], - native: Union[native_pd.DataFrame, native_pd.Series], + snow: DataFrame | Series, + native: native_pd.DataFrame | native_pd.Series, **kwargs, ) -> None: """ @@ -359,8 +368,8 @@ def eval_snowpark_pandas_result( comparator: Callable = assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, inplace: bool = False, expect_exception: bool = False, - expect_exception_type: Optional[type[Exception]] = None, - expect_exception_match: Optional[str] = None, + expect_exception_type: type[Exception] | None = 
None, + expect_exception_match: str | None = None, assert_exception_equal: bool = True, **kwargs: Any, ) -> None: @@ -467,6 +476,9 @@ def assert_values_equal( Returns: bool telling whether the values are equal. """ + expected = try_convert_index_to_native(expected) + actual = try_convert_index_to_native(actual) + if isinstance(expected, native_pd.DataFrame): assert isinstance( actual, native_pd.DataFrame @@ -659,7 +671,7 @@ def create_snow_df_with_table_and_data( def create_table_with_type( - session: "Session", name: str, schema: str, table_type: str = "temporary" + session: Session, name: str, schema: str, table_type: str = "temporary" ): session._run_query(f"create or replace {table_type} table {name} ({schema})") @@ -751,3 +763,60 @@ def get_snowpark_dataframe_quoted_identifiers( snowpark_dataframe: SnowparkDataFrame, ) -> list[str]: return [f.column_identifier.quoted_name for f in snowpark_dataframe.schema.fields] + + +def assert_index_equal( + left: pd.Index, + right: pd.Index, + exact: bool | str = "equiv", + check_names: bool = True, + check_exact: bool = True, + check_categorical: bool = True, + check_order: bool = True, + rtol: float = 1.0e-5, + atol: float = 1.0e-8, + obj: str = "Index", +): + """ + Check that left and right Index are equal. + + Parameters + ---------- + left : Index + right : Index + exact : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. If 'equiv', then RangeIndex can be substituted for + Index with an int64 dtype as well. + check_names : bool, default True + Whether to check the names attribute. + check_exact : bool, default True + Whether to compare number exactly. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + check_order : bool, default True + Whether to compare the order of index entries as well as their values. + If True, both indexes must contain the same elements, in the same order. + If False, both indexes must contain the same elements, but in any order. + rtol : float, default 1e-5 + Relative tolerance. Only used when check_exact is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when check_exact is False. + obj : str, default 'Index' + Specify object name being compared, internally used to show appropriate + assertion message. 
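+
+    Notes
+    -----
+    This is a thin wrapper around ``pandas._testing.assert_index_equal`` that
+    first converts any Snowpark pandas index arguments to native pandas
+    indexes via ``try_convert_index_to_native``.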
+ """ + left = try_convert_index_to_native(left) + right = try_convert_index_to_native(right) + return native_pd._testing.assert_index_equal( + left, + right, + exact=exact, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + check_order=check_order, + rtol=rtol, + atol=atol, + obj=obj, + ) diff --git a/tests/unit/modin/test_class.py b/tests/unit/modin/test_class.py index d38dc94c208..a251f4bbfc9 100644 --- a/tests/unit/modin/test_class.py +++ b/tests/unit/modin/test_class.py @@ -6,6 +6,7 @@ import pandas as native_pd import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.modin.plugin._internal.index import Index def test_class_equivalence(): @@ -30,7 +31,7 @@ def test_class_equivalence(): assert pd.Float32Dtype is native_pd.Float32Dtype assert pd.Float64Dtype is native_pd.Float64Dtype assert pd.Grouper is native_pd.Grouper - assert pd.Index is native_pd.Index + assert pd.Index is Index assert pd.IndexSlice is native_pd.IndexSlice assert pd.Int8Dtype is native_pd.Int8Dtype assert pd.Int16Dtype is native_pd.Int16Dtype diff --git a/tests/unit/modin/test_internal_frame.py b/tests/unit/modin/test_internal_frame.py index 3d2eb863b4c..47e70d29e98 100644 --- a/tests/unit/modin/test_internal_frame.py +++ b/tests/unit/modin/test_internal_frame.py @@ -7,7 +7,6 @@ import pandas as pd import pytest -from pandas.testing import assert_index_equal from snowflake.snowpark._internal.analyzer.sort_expression import ( Ascending, @@ -29,7 +28,7 @@ StructField, StructType, ) -from tests.integ.modin.utils import VALID_PANDAS_LABELS +from tests.integ.modin.utils import VALID_PANDAS_LABELS, assert_index_equal class TestDataFrames: diff --git a/tests/unit/modin/test_type_annotations.py b/tests/unit/modin/test_type_annotations.py index 66abb774878..63a89d1dfbb 100644 --- a/tests/unit/modin/test_type_annotations.py +++ b/tests/unit/modin/test_type_annotations.py @@ -5,7 +5,6 @@ from typing import get_type_hints import modin.pandas as pd -import pandas import pytest import snowflake.snowpark.modin.plugin # noqa: F401 @@ -15,7 +14,7 @@ "method,type_hints", [ (pd.Series.empty.fget, {"return": bool}), - (pd.DataFrame.columns.fget, {"return": pandas.Index}), + (pd.DataFrame.columns.fget, {"return": pd.Index}), ], ) def test_properties_snow_1374293(method, type_hints): From ef71050e950ac05400de455c33c967d31bb970f7 Mon Sep 17 00:00:00 2001 From: Adam Ling Date: Fri, 31 May 2024 17:51:47 -0700 Subject: [PATCH 05/12] SNOW-1455291: fix dataframe creation of empty data with DateType (#1716) --- CHANGELOG.md | 1 + src/snowflake/snowpark/mock/_functions.py | 9 ++++---- tests/integ/test_dataframe.py | 26 +++++++++++++++++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac93af15836..af9738ae113 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ #### Bug Fixes - Fixed a bug in convert_timezone that made the setting the source_timezone parameter return an error. +- Fixed a bug where creating DataFrame with empty data of type `DateType` raises `AttributeError`. 
### Snowpark pandas API Updates diff --git a/src/snowflake/snowpark/mock/_functions.py b/src/snowflake/snowpark/mock/_functions.py index be496fddcf0..ae93401a63b 100644 --- a/src/snowflake/snowpark/mock/_functions.py +++ b/src/snowflake/snowpark/mock/_functions.py @@ -338,12 +338,11 @@ def mock_to_date( """ import dateutil.parser - fmt = [fmt] * len(column) if not isinstance(fmt, ColumnEmulator) else fmt + if not isinstance(fmt, ColumnEmulator): + fmt = ColumnEmulator([fmt] * len(column), index=column.index) - def convert_date(row): + def convert_date(data, _fmt): try: - _fmt = fmt[row.name] - data = row[0] auto_detect = _fmt is None or _fmt.lower() == "auto" date_format, _ = convert_snowflake_datetime_format( _fmt, default_format="%Y-%m-%d" @@ -399,7 +398,7 @@ def convert_date(row): else: SnowparkLocalTestingException.raise_from_error(exc) - res = column.to_frame().apply(convert_date, axis=1) + res = column.combine(fmt, convert_date) res.sf_type = ColumnType(DateType(), column.sf_type.nullable) return res diff --git a/tests/integ/test_dataframe.py b/tests/integ/test_dataframe.py index a4000a68c41..4b102359928 100644 --- a/tests/integ/test_dataframe.py +++ b/tests/integ/test_dataframe.py @@ -70,6 +70,7 @@ IntegerType, LongType, MapType, + NullType, ShortType, StringType, StructField, @@ -3983,3 +3984,28 @@ def test_dataframe_to_local_iterator_isolation(session): assert ( row_counter == ROW_NUMBER ), f"Expect {ROW_NUMBER} rows, Got {row_counter} instead" + + +def test_create_empty_dataframe(session): + schema = StructType( + [ + StructField("COL1", IntegerType()), + StructField("COL2", ByteType()), + StructField("COL3", ShortType()), + StructField("COL4", LongType()), + StructField("COL5", FloatType()), + StructField("COL6", DoubleType()), + StructField("COL7", DecimalType()), + StructField("COL8", BooleanType()), + StructField("COL9", BinaryType()), + StructField("COL10", VariantType()), + StructField("COL11", StringType()), + StructField("COL12", DateType()), + StructField("COL13", TimestampType()), + StructField("COL14", TimeType()), + StructField("COL15", TimestampType(TimestampTimeZone.NTZ)), + StructField("COL16", MapType()), + StructField("COL17", NullType()), + ] + ) + assert not session.create_dataframe(data=[], schema=schema).collect() From c26be8a6e4f6f26b339074e02850f6373e1f536e Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Fri, 31 May 2024 20:13:50 -0700 Subject: [PATCH 06/12] SNOW-1445360: Add Support for Series.str.get (#1714) --- CHANGELOG.md | 1 + .../modin/supported/series_str_supported.rst | 4 +- .../compiler/snowflake_query_compiler.py | 53 ++++++++++++++++++- .../modin/plugin/docstrings/series_utils.py | 41 +++++++++++++- tests/integ/modin/series/test_str_accessor.py | 23 ++++++++ tests/unit/modin/test_series_strings.py | 1 - 6 files changed, 118 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index af9738ae113..a5e304b07d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ #### New Features - Added partial support for `DataFrame.pct_change` and `Series.pct_change` without the `freq` and `limit` parameters. +- Added support for `Series.str.get`. #### Bug Fixes diff --git a/docs/source/modin/supported/series_str_supported.rst b/docs/source/modin/supported/series_str_supported.rst index ccd88770061..79732758b08 100644 --- a/docs/source/modin/supported/series_str_supported.rst +++ b/docs/source/modin/supported/series_str_supported.rst @@ -44,7 +44,9 @@ the method in the left column. 
+-----------------------------+---------------------------------+----------------------------------------------------+ | ``fullmatch`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ -| ``get`` | N | | +| ``get`` | P | ``N`` if the `i` parameter is set to a non-int | +| | | value. Also non-string data values such as list | +| | | and dict are not yet supported. | +-----------------------------+---------------------------------+----------------------------------------------------+ | ``get_dummies`` | N | | +-----------------------------+---------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 481af458bf7..aa8ebc30f99 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -12284,8 +12284,57 @@ def output_col(col_name: ColumnOrName) -> SnowparkColumn: ) return SnowflakeQueryCompiler(new_internal_frame) - def str_get(self, i: int) -> None: - ErrorMessage.method_not_implemented_error("get", "Series.str") + def str_get(self, i: int) -> "SnowflakeQueryCompiler": + """ + Extract element from each component at specified position or with specified key. + + Extract element from lists, tuples, dict, or strings in each element in the Series/Index. + + Parameters + ---------- + i : int + Position or key of element to extract. + + Returns + ------- + SnowflakeQueryCompiler representing result of the string operation. + """ + if i is not None and not isinstance(i, int): + ErrorMessage.not_implemented( + "Snowpark pandas method 'Series.str.get' doesn't yet support non-numeric 'i' argument" + ) + + def output_col(col_name: ColumnOrName) -> SnowparkColumn: + col_len_exp = length(col(col_name)) + if i is None: + new_col = pandas_lit(None) + elif i < 0: + # Index is relative to the end boundary. + # If it falls before the beginning boundary, Null is returned. + # Note that string methods in pandas are 0-based while in Snowflake, they are 1-based. + new_col = iff( + pandas_lit(i) + col_len_exp < pandas_lit(0), + pandas_lit(None), + substring( + col(col_name), pandas_lit(i + 1) + col_len_exp, pandas_lit(1) + ), + ) + else: + assert i >= 0 + # Index is relative to the beginning boundary. + # If it falls after the end boundary, Null is returned. + # Note that string methods in pandas are 0-based while in Snowflake, they are 1-based. 
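+                # For example, with the string "abc": get(1) builds
+                # SUBSTRING(col, 2, 1) and returns "b", while get(5) returns
+                # NULL because 5 >= LENGTH("abc").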
+ new_col = iff( + pandas_lit(i) >= col_len_exp, + pandas_lit(None), + substring(col(col_name), pandas_lit(i + 1), pandas_lit(1)), + ) + return self._replace_non_str(col(col_name), new_col) + + new_internal_frame = self._modin_frame.apply_snowpark_function_to_data_columns( + output_col + ) + return SnowflakeQueryCompiler(new_internal_frame) def str_get_dummies(self, sep: str) -> None: ErrorMessage.method_not_implemented_error("get_dummies", "Series.str") diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py index 28940b6d6a5..cd03e57ca64 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py @@ -107,7 +107,46 @@ def rsplit(): pass def get(): - pass + """ + Extract element from each component at specified position or with specified key. + + Extract element from lists, tuples, dict, or strings in each element in the Series/Index. + + Parameters + ---------- + i : int + Position or key of element to extract. + + Returns + ------- + Series or Index + + Examples + -------- + >>> s = pd.Series(["String", + ... (1, 2, 3), + ... ["a", "b", "c"], + ... 123, + ... -456, + ... {1: "Hello", "2": "World"}]) + >>> s.str.get(1) + 0 t + 1 None + 2 None + 3 None + 4 None + 5 None + dtype: object + + >>> s.str.get(-1) + 0 g + 1 None + 2 None + 3 None + 4 None + 5 None + dtype: object + """ def join(): pass diff --git a/tests/integ/modin/series/test_str_accessor.py b/tests/integ/modin/series/test_str_accessor.py index eb37febfbee..26749695a0b 100644 --- a/tests/integ/modin/series/test_str_accessor.py +++ b/tests/integ/modin/series/test_str_accessor.py @@ -159,6 +159,29 @@ def test_str_count(pat, flags): ) +@pytest.mark.parametrize("i", [None, -100, -2, -1, 0, 1, 2, 100]) +@sql_count_checker(query_count=1) +def test_str_get(i): + native_ser = native_pd.Series(TEST_DATA) + snow_ser = pd.Series(native_ser) + eval_snowpark_pandas_result( + snow_ser, + native_ser, + lambda ser: ser.str.get(i=i), + ) + + +@sql_count_checker(query_count=0) +def test_str_get_neg(): + native_ser = native_pd.Series(TEST_DATA) + snow_ser = pd.Series(native_ser) + with pytest.raises( + NotImplementedError, + match="Snowpark pandas method 'Series.str.get' doesn't yet support non-numeric 'i' argument", + ): + snow_ser.str.get(i="a") + + @pytest.mark.parametrize("start", [None, -100, -2, -1, 0, 1, 2, 100]) @pytest.mark.parametrize("stop", [None, -100, -2, -1, 0, 1, 2, 100]) @pytest.mark.parametrize("step", [None, -100, -2, -1, 1, 2, 100]) diff --git a/tests/unit/modin/test_series_strings.py b/tests/unit/modin/test_series_strings.py index 285c1ce0ce9..f6f83ec6d43 100644 --- a/tests/unit/modin/test_series_strings.py +++ b/tests/unit/modin/test_series_strings.py @@ -34,7 +34,6 @@ def test_str_cat_no_others(mock_str_register, mock_series): (lambda s: s.str.decode("utf-8"), "decode"), (lambda s: s.str.encode("utf-8"), "encode"), (lambda s: s.str.rsplit("_", n=1), "rsplit"), - (lambda s: s.str.get(3), "get"), (lambda s: s.str.join("_"), "join"), (lambda s: s.str.pad(10), "pad"), (lambda s: s.str.center(10), "center"), From 3c4b2f8074e1d8fb89a3828a924e7e37f2c4c2d4 Mon Sep 17 00:00:00 2001 From: Andong Zhan Date: Mon, 3 Jun 2024 09:46:41 -0700 Subject: [PATCH 07/12] SNOW-1428607 Reenable quarantined tests after 8.20 (#1720) --- tests/integ/modin/frame/test_apply.py | 3 +-- tests/integ/modin/series/test_apply.py | 11 +---------- 2 files changed, 2 insertions(+), 12 
deletions(-)

diff --git a/tests/integ/modin/frame/test_apply.py b/tests/integ/modin/frame/test_apply.py
index 8d71625f294..be5d49fcd3f 100644
--- a/tests/integ/modin/frame/test_apply.py
+++ b/tests/integ/modin/frame/test_apply.py
@@ -850,8 +850,7 @@ def test_apply_axis_1_frame_with_column_of_all_nulls_snow_1233832(null_value):
     [
         (["scipy", "numpy"], 7),
         (["scipy>1.1", "numpy<2.0"], 7),
-        # TODO: SNOW-1428607 Re-enable quarantined tests for 8.20
-        # [scipy, np], 9),
+        ([scipy, np], 9),
     ],
 )
 def test_apply_axis1_with_3rd_party_libraries_and_decorator(
diff --git a/tests/integ/modin/series/test_apply.py b/tests/integ/modin/series/test_apply.py
index 1c396cf3dcd..7a0507c445e 100644
--- a/tests/integ/modin/series/test_apply.py
+++ b/tests/integ/modin/series/test_apply.py
@@ -507,13 +507,7 @@ def f(x):
 
 @pytest.mark.parametrize(
     "package,expected_query_count",
-    [
-        ("scipy", 7),
-        ("scipy>=1.0", 7),
-        ("scipy<1.12.0", 7),
-        # TODO: SNOW-1428607 Re-enable quarantined tests for 8.20
-        # (scipy, 9)
-    ],
+    [("scipy", 7), ("scipy>=1.0", 7), ("scipy<1.12.0", 7), (scipy, 9)],
 )
 def test_3rd_party_package_with_udf_annotation(package, expected_query_count):
 
@@ -607,9 +601,6 @@ def func(nsample):
 )
 
 
-@pytest.mark.xfail(
-    reason="TODO: SNOW-1428607 Re-enable quarantined tests for 8.20",
-)
 @pytest.mark.parametrize("udf_packages,session_packages", [(["pandas", np], [scipy])])
 @sql_count_checker(query_count=5, join_count=2, udf_count=1)
 def test_3rd_party_package_mix_and_match(udf_packages, session_packages):

From d78e340d411ef5ccce2824c50b5690164a9bc4de Mon Sep 17 00:00:00 2001
From: Adam Ling
Date: Mon, 3 Jun 2024 11:13:49 -0700
Subject: [PATCH 08/12] SNOW-1427770 local testing table merge fails when update clause exists but no update takes place (#1719)

---
 CHANGELOG.md                                 |  1 +
 src/snowflake/snowpark/mock/_plan.py         | 12 ++++++--
 .../scala/test_update_delete_merge_suite.py  | 29 +++++++++++++++++++
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a5e304b07d9..3b9ecb2f430 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@
 
 - Fixed a bug in convert_timezone that made the setting the source_timezone parameter return an error.
 - Fixed a bug where creating a DataFrame with empty data of type `DateType` raised an `AttributeError`.
+- Fixed a bug where a table merge failed when an update clause was present but no update took place.
### Snowpark pandas API Updates diff --git a/src/snowflake/snowpark/mock/_plan.py b/src/snowflake/snowpark/mock/_plan.py index 9f4f379d397..cb84fd845f7 100644 --- a/src/snowflake/snowpark/mock/_plan.py +++ b/src/snowflake/snowpark/mock/_plan.py @@ -1358,11 +1358,15 @@ def outer_join(base_df): # (2) A target row is selected to be both updated and deleted inserted_rows = [] + insert_clause_specified = ( + update_clause_specified + ) = delete_clause_specified = False inserted_row_idx = set() # source_row_id deleted_row_idx = set() updated_row_idx = set() for clause in source_plan.clauses: if isinstance(clause, UpdateMergeExpression): + update_clause_specified = True # Select rows to update if clause.condition: condition = calculate_expression( @@ -1393,6 +1397,7 @@ def outer_join(base_df): updated_row_idx.add(row[ROW_ID]) elif isinstance(clause, DeleteMergeExpression): + delete_clause_specified = True # Select rows to delete if clause.condition: condition = calculate_expression( @@ -1415,6 +1420,7 @@ def outer_join(base_df): target = target[~matched] elif isinstance(clause, InsertMergeExpression): + insert_clause_specified = True # calculate unmatched rows in the source matched = source.apply(tuple, 1).isin( join_result[source.columns].apply(tuple, 1) @@ -1499,11 +1505,11 @@ def outer_join(base_df): # Generate metadata result res = [] - if inserted_rows: + if insert_clause_specified: res.append(len(inserted_row_idx)) - if updated_row_idx: + if update_clause_specified: res.append(len(updated_row_idx)) - if deleted_row_idx: + if delete_clause_specified: res.append(len(deleted_row_idx)) return [Row(*res)] diff --git a/tests/integ/scala/test_update_delete_merge_suite.py b/tests/integ/scala/test_update_delete_merge_suite.py index ee5498a4884..5a0ec56940c 100644 --- a/tests/integ/scala/test_update_delete_merge_suite.py +++ b/tests/integ/scala/test_update_delete_merge_suite.py @@ -604,3 +604,32 @@ def test_update_with_join_involving_null_values(session): t1, [Row(1, "one", "un"), Row(2, "two", "deux")], ) + + +def test_merge_multi_operation(session): + target = session.create_dataframe([[1, "a"]], schema=["id", "name"]) + target.write.save_as_table("target", mode="overwrite") + target = session.table("target") + + source = session.create_dataframe([[2, "b"]], schema=["id", "name"]) + # update + insert + target.merge( + source, + target["id"] == source["id"], + [ + when_matched().update({"name": source["name"]}), + when_not_matched().insert({"id": source["id"], "name": source["name"]}), + ], + ) + assert target.sort(col("id")).collect() == [Row(1, "a"), Row(2, "b")] + + # delete + insert + target.merge( + source, + target["id"] == source["id"], + [ + when_matched().delete(), + when_not_matched().insert({"id": source["id"], "name": source["name"]}), + ], + ) + assert target.sort(col("id")).collect() == [Row(1, "a")] From 63074e3d59ccdbb3213598724e7f930851c6eb95 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Mon, 3 Jun 2024 13:30:59 -0700 Subject: [PATCH 09/12] SNOW-1367208: fix No stored procedure will run in a task unless pin snowflake-snowpark-python to an earlier version. 
 (#1715)

---
 CHANGELOG.md                                |  5 ++++
 .../snowpark/_internal/server_connection.py |  8 +++++-
 tests/unit/test_server_connection.py        | 26 +++++++++++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3b9ecb2f430..6e883a94090 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,11 @@
 
 #### Improvements
 
+
+#### Bug Fixes
+
+- Fixed a bug where a Python stored procedure with a table return type failed when run in a task.
+
 ### Snowpark Local Testing Updates
 
 #### New Features
diff --git a/src/snowflake/snowpark/_internal/server_connection.py b/src/snowflake/snowpark/_internal/server_connection.py
index 6eb10d56365..121df7839e1 100644
--- a/src/snowflake/snowpark/_internal/server_connection.py
+++ b/src/snowflake/snowpark/_internal/server_connection.py
@@ -390,6 +390,7 @@ def run_query(
         case_sensitive: bool = True,
         params: Optional[Sequence[Any]] = None,
         num_statements: Optional[int] = None,
+        ignore_results: bool = False,
         **kwargs,
     ) -> Union[Dict[str, Any], AsyncJob]:
         try:
@@ -422,6 +423,8 @@
             # have non-select statements, and it shouldn't fail if the user
             # calls to_pandas() to execute the query.
             if block:
+                if ignore_results:
+                    return {"data": None, "sfqid": results_cursor.sfqid}
                 return self._to_data_or_iter(
                     results_cursor=results_cursor, to_pandas=to_pandas, to_iter=to_iter
                 )
@@ -541,6 +544,7 @@ def get_result_set(
         data_type: _AsyncResultType = _AsyncResultType.ROW,
         log_on_exception: bool = False,
         case_sensitive: bool = True,
+        ignore_results: bool = False,
         **kwargs,
     ) -> Tuple[
         Dict[
@@ -592,6 +596,7 @@
                 case_sensitive=case_sensitive,
                 num_statements=len(plan.queries),
                 params=params,
+                ignore_results=ignore_results,
                 **kwargs,
             )
 
@@ -620,6 +625,7 @@
                     log_on_exception=log_on_exception,
                     case_sensitive=case_sensitive,
                     params=query.params,
+                    ignore_results=ignore_results,
                     **kwargs,
                 )
                 placeholders[query.query_id_place_holder] = (
@@ -656,7 +662,7 @@ def get_result_and_metadata(
 
     def get_result_query_id(self, plan: SnowflakePlan, **kwargs) -> str:
         # get the iterator such that the data is not fetched
-        result_set, _ = self.get_result_set(plan, to_iter=True, **kwargs)
+        result_set, _ = self.get_result_set(plan, ignore_results=True, **kwargs)
         return result_set["sfqid"]
 
     @_Decorator.wrap_exception
diff --git a/tests/unit/test_server_connection.py b/tests/unit/test_server_connection.py
index fa3832eeb8e..72ccb6f6c42 100644
--- a/tests/unit/test_server_connection.py
+++ b/tests/unit/test_server_connection.py
@@ -5,6 +5,7 @@
 import io
 import logging
 from unittest import mock
+from unittest.mock import MagicMock
 
 import pytest
 
@@ -139,3 +140,28 @@ def test_get_result_set_exception(mock_server_connection):
     with mock.patch.object(mock_server_connection, "run_query", return_value=None):
         with pytest.raises(SnowparkSQLException, match="doesn't return a ResultSet"):
             mock_server_connection.get_result_set(fake_plan, block=False)
+
+
+def test_run_query_when_ignore_results_true(mock_server_connection):
+    mock_cursor1 = MagicMock()
+    mock_cursor1.sfqid = "ignore_results is True"
+
+    mock_server_connection.execute_and_notify_query_listener = MagicMock()
+    mock_server_connection.execute_and_notify_query_listener.return_value = mock_cursor1
+
+    mock_server_connection._to_data_or_iter = MagicMock()
+    mock_server_connection._to_data_or_iter.return_value = {
+        "sfqid": "ignore_results is False"
+    }
+
+    result = mock_server_connection.run_query(
+        "select * from fake_table", ignore_results=True
+    )
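+    # With ignore_results=True the raw query id should be surfaced directly,
+    # so _to_data_or_iter must never be invoked.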
mock_server_connection._to_data_or_iter.assert_not_called() + assert "sfqid" in result and result["sfqid"] == "ignore_results is True" + + result = mock_server_connection.run_query( + "select * from fake_table", ignore_results=False + ) + mock_server_connection._to_data_or_iter.assert_called() + assert "sfqid" in result and result["sfqid"] == "ignore_results is False" From 29b3bba9a7bd13206221b6fc6ad9ed0fae853fe0 Mon Sep 17 00:00:00 2001 From: Naresh Kumar <113932371+sfc-gh-nkumar@users.noreply.github.com> Date: Mon, 3 Jun 2024 17:48:43 -0700 Subject: [PATCH 10/12] SNOW-1445337: Log warning when data is materialized during dataframe creation (#1717) --- src/snowflake/snowpark/modin/plugin/_internal/utils.py | 5 ++++- tests/integ/modin/io/test_read_snowflake.py | 7 ++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/utils.py b/src/snowflake/snowpark/modin/plugin/_internal/utils.py index 01230fda011..10786778a75 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/utils.py @@ -248,7 +248,10 @@ def _create_read_only_table( ctas_query = f"SELECT * FROM {table_name}" temp_table_name = random_name_for_temp_object(TempObjectType.TABLE) - _logger.debug(f"Materialize temporary table {temp_table_name} for {ctas_query}") + _logger.warning( + f"Data from source table/view '{table_name}' is being copied into a new " + f"temporary table '{temp_table_name}'. DataFrame creation might take some time." + ) statement_params = get_default_snowpark_pandas_statement_params() # record 1) original table name (which may not be an actual table) diff --git a/tests/integ/modin/io/test_read_snowflake.py b/tests/integ/modin/io/test_read_snowflake.py index 79d98dd71ef..cef0479300b 100644 --- a/tests/integ/modin/io/test_read_snowflake.py +++ b/tests/integ/modin/io/test_read_snowflake.py @@ -358,15 +358,16 @@ def test_read_snowflake_with_views( ).collect() table_name = view_name caplog.clear() - with caplog.at_level(logging.DEBUG): + with caplog.at_level(logging.WARNING): df = call_read_snowflake(table_name, as_query) assert df.columns.tolist() == ["COL1", "S"] + materialize_log = f"Data from source table/view '{table_name}' is being copied into a new temporary table" if table_type in ["view", "SECURE VIEW", "TEMP VIEW"]: # verify temporary table is materialized for view, secure view and temp view - assert "Materialize temporary table" in caplog.text + assert materialize_log in caplog.text else: # verify no temporary table is materialized for regular table - assert not ("Materialize temporary table" in caplog.text) + assert not (materialize_log in caplog.text) finally: if view_name: Utils.drop_view(session, view_name) From 903eee29597687ac64d4041c903c106d7a3e84a8 Mon Sep 17 00:00:00 2001 From: Mahesh Vashishtha Date: Tue, 4 Jun 2024 10:01:18 -0700 Subject: [PATCH 11/12] SNOW-1231617: Support dayofweek, day_of_week, dayofyear, and day_of_year. 
From 903eee29597687ac64d4041c903c106d7a3e84a8 Mon Sep 17 00:00:00 2001
From: Mahesh Vashishtha
Date: Tue, 4 Jun 2024 10:01:18 -0700
Subject: [PATCH 11/12] SNOW-1231617: Support dayofweek, day_of_week,
 dayofyear, and day_of_year. (#1728)

Signed-off-by: sfc-gh-mvashishtha

---
 CHANGELOG.md                                     |  1 +
 .../modin/supported/series_dt_supported.rst      |  8 +--
 .../compiler/snowflake_query_compiler.py         |  6 ++
 .../modin/plugin/docstrings/series_utils.py      | 47 ++++++++++++++
 tests/integ/modin/series/test_dt_accessor.py     | 61 ++++++++++++++++++-
 tests/integ/modin/test_unimplemented.py          |  2 +-
 tests/unit/modin/test_series_dt.py               |  2 -
 7 files changed, 119 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6e883a94090..afb4515f69d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,7 @@

 - Added partial support for `DataFrame.pct_change` and `Series.pct_change` without the `freq` and `limit` parameters.
 - Added support for `Series.str.get`.
+- Added support for `Series.dt.dayofweek`, `Series.dt.day_of_week`, `Series.dt.dayofyear`, and `Series.dt.day_of_year`.

 #### Bug Fixes
diff --git a/docs/source/modin/supported/series_dt_supported.rst b/docs/source/modin/supported/series_dt_supported.rst
index fbf579d3052..897eaf81b11 100644
--- a/docs/source/modin/supported/series_dt_supported.rst
+++ b/docs/source/modin/supported/series_dt_supported.rst
@@ -36,19 +36,19 @@ the method in the left column.
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``nanosecond``              | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``day_of_week``             | N                               |                                                    |
+| ``day_of_week``             | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``week``                    | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``weekofyear``              | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``dayofweek``               | N                               |                                                    |
+| ``dayofweek``               | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``weekday``                 | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``dayofyear``               | N                               |                                                    |
+| ``dayofyear``               | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``day_of_year``             | N                               |                                                    |
+| ``day_of_year``             | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``quarter``                 | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
index aa8ebc30f99..714648d949b 100644
--- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
+++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -83,6 +83,7 @@
     date_part,
     date_trunc,
     dayofmonth,
+    dayofyear,
     dense_rank,
     first_value,
     greatest,
@@ -8842,6 +8843,11 @@ def dt_property(self, property_name: str) -> "SnowflakeQueryCompiler":
             "month": month,
             "year": year,
             "quarter": quarter,
+            "dayofyear": dayofyear,
+            # Use DAYOFWEEKISO for `dayofweek` so that the result doesn't
+            # depend on the Snowflake session's WEEK_START parameter. Subtract
+            # 1 to match pandas semantics.
+            "dayofweek": (lambda column: builtin("dayofweekiso")(col(column)) - 1),
         }
         property_function = dt_property_to_function_map.get(property_name)
         if not property_function:
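The `- 1` in the dayofweek lambda can be sanity-checked without Snowflake at all: Python's datetime.isoweekday() follows the same ISO numbering (Monday=1 through Sunday=7) that DAYOFWEEKISO uses, while pandas numbers days Monday=0 through Sunday=6. A quick check using only the standard library and pandas:

    import datetime

    import pandas as native_pd

    for offset in range(7):
        # 2024-07-29 is a Monday; walk one full week.
        d = datetime.date(2024, 7, 29) + datetime.timedelta(days=offset)
        assert d.isoweekday() - 1 == native_pd.Timestamp(d).dayofweek

Plain DAYOFWEEK would not work here, because its output shifts with the session's WEEK_START parameter; the ISO variant is fixed, which is why the comment above calls it out.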
diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py
index cd03e57ca64..ecb2ff90eef 100644
--- a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py
+++ b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py
@@ -1121,6 +1121,38 @@ def nanosecond():

 @property
 def dayofweek():
+    """
+    The day of the week with Monday=0, Sunday=6.
+
+    Return the day of the week. It is assumed the week starts on Monday,
+    which is denoted by 0, and ends on Sunday, which is denoted by 6.
+
+    Examples
+    --------
+    >>> s = pd.date_range('2016-12-31', '2017-01-08', freq='D')
+    >>> s
+    0   2016-12-31
+    1   2017-01-01
+    2   2017-01-02
+    3   2017-01-03
+    4   2017-01-04
+    5   2017-01-05
+    6   2017-01-06
+    7   2017-01-07
+    8   2017-01-08
+    dtype: datetime64[ns]
+    >>> s.dt.dayofweek
+    0    5
+    1    6
+    2    0
+    3    1
+    4    2
+    5    3
+    6    4
+    7    5
+    8    6
+    dtype: int64
+    """
     pass

 @property
 def weekday():
     pass

 @property
 def dayofyear():
+    """
+    The ordinal day of the year.
+
+    Examples
+    --------
+    >>> s = pd.to_datetime(["1/1/2020", "2/1/2020"])
+    >>> s
+    0   2020-01-01
+    1   2020-02-01
+    dtype: datetime64[ns]
+    >>> s.dt.dayofyear
+    0     1
+    1    32
+    dtype: int16
+    """
     pass

 @property
diff --git a/tests/integ/modin/series/test_dt_accessor.py b/tests/integ/modin/series/test_dt_accessor.py
index a63f752d4f7..d33dc93d0f7 100644
--- a/tests/integ/modin/series/test_dt_accessor.py
+++ b/tests/integ/modin/series/test_dt_accessor.py
@@ -10,7 +10,7 @@
 import snowflake.snowpark.modin.plugin  # noqa: F401
 from tests.integ.modin.sql_counter import sql_count_checker
-from tests.integ.modin.utils import eval_snowpark_pandas_result
+from tests.integ.modin.utils import create_test_series, eval_snowpark_pandas_result

 dt_properties = pytest.mark.parametrize(
     "property_name",
 )

+
+@pytest.fixture
+def day_of_week_or_year_data() -> native_pd.Series:
+    return native_pd.Series(
+        [
+            pd.Timestamp(
+                year=2017,
+                month=1,
+                day=1,
+                hour=10,
+                minute=59,
+                second=59,
+                microsecond=5959,
+            ),
+            pd.Timestamp(year=2000, month=2, day=1),
+            pd.NaT,
+            pd.Timestamp(year=2024, month=7, day=29),
+        ],
+    )
+
+
+@pytest.fixture
+def set_week_start(request):
+    original_start = (
+        pd.session.sql("SHOW PARAMETERS LIKE 'WEEK_START'").collect()[0].value
+    )
+    pd.session.connection.cursor().execute(
+        f"ALTER SESSION SET WEEK_START = {request.param};"
+    )
+    yield
+    pd.session.connection.cursor().execute(
+        f"ALTER SESSION SET WEEK_START = {original_start};"
+    )
+
+
 @pytest.mark.parametrize(
     "datetime_index_value",
     [
@@ -38,6 +72,31 @@ def test_date(datetime_index_value):
     eval_snowpark_pandas_result(snow_ser, native_ser, lambda ser: ser.dt.date)

+
+@sql_count_checker(query_count=1)
+@pytest.mark.parametrize("property", ["dayofyear", "day_of_year"])
+def test_day_of_year(property, day_of_week_or_year_data):
+    eval_snowpark_pandas_result(
+        *create_test_series(day_of_week_or_year_data),
+        lambda df: getattr(df.dt, property),
+    )
+
+
+@sql_count_checker(query_count=1)
+@pytest.mark.parametrize("property", ["dayofweek", "day_of_week"])
+@pytest.mark.parametrize(
+    "set_week_start",
+    # Test different WEEK_START values because WEEK_START changes the DAYOFWEEK
+    # in Snowflake.
+    list(range(8)),
+    indirect=True,
+)
+def test_day_of_week(property, day_of_week_or_year_data, set_week_start):
+    eval_snowpark_pandas_result(
+        *create_test_series(day_of_week_or_year_data),
+        lambda df: getattr(df.dt, property),
+    )
+
+
 @dt_properties
 @sql_count_checker(query_count=1)
 def test_dt_property_with_tz(property_name):
diff --git a/tests/integ/modin/test_unimplemented.py b/tests/integ/modin/test_unimplemented.py
index 70645a88f66..ddc03a17e14 100644
--- a/tests/integ/modin/test_unimplemented.py
+++ b/tests/integ/modin/test_unimplemented.py
@@ -152,7 +152,7 @@ def test_unsupported_str_methods(func, func_name, caplog) -> None:
 # The full set of DateTimeAccessor test is under tests/integ/modin/series/test_dt_accessor.py
 UNSUPPORTED_DT_METHODS = [
     (lambda ds: ds.dt.is_month_start, "property fget:is_month_start"),
-    (lambda ds: ds.dt.dayofweek, "property fget:dayofweek"),
+    (lambda ds: ds.dt.is_year_end, "property fget:is_year_end"),
 ]

diff --git a/tests/unit/modin/test_series_dt.py b/tests/unit/modin/test_series_dt.py
index a30f7794b88..3e965f76d9c 100644
--- a/tests/unit/modin/test_series_dt.py
+++ b/tests/unit/modin/test_series_dt.py
@@ -36,9 +36,7 @@ def mock_query_compiler_for_dt_series() -> SnowflakeQueryCompiler:
     (lambda s: s.dt.timetz, "timetz"),
     (lambda s: s.dt.microsecond, "microsecond"),
     (lambda s: s.dt.nanosecond, "nanosecond"),
-    (lambda s: s.dt.dayofweek, "dayofweek"),
     (lambda s: s.dt.weekday, "weekday"),
-    (lambda s: s.dt.dayofyear, "dayofyear"),
     (lambda s: s.dt.is_month_start, "is_month_start"),
     (lambda s: s.dt.is_month_end, "is_month_end"),
     (lambda s: s.dt.is_quarter_start, "is_quarter_start"),
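With this patch applied, the four properties behave as two alias pairs, matching native pandas. A short usage check against plain pandas, which is the semantics Snowpark pandas mirrors here:

    import pandas as native_pd

    s = native_pd.Series(native_pd.to_datetime(["2024-07-29", "2024-12-31"]))
    assert s.dt.dayofweek.equals(s.dt.day_of_week)  # Monday -> 0, Tuesday -> 1
    assert s.dt.dayofyear.equals(s.dt.day_of_year)  # 211, and 366 in a leap year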
From e7813c14fa195e0992e0c59f713bd8dc5f4b4d7e Mon Sep 17 00:00:00 2001
From: Jianzhun Du <68252326+sfc-gh-jdu@users.noreply.github.com>
Date: Tue, 4 Jun 2024 11:16:26 -0700
Subject: [PATCH 12/12] Skip SQL count check in test (#1721)

---
 tests/integ/modin/conftest.py    | 12 ++++++++++++
 tests/integ/modin/sql_counter.py | 14 +++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/tests/integ/modin/conftest.py b/tests/integ/modin/conftest.py
index c7aa506d625..9479ec99a55 100644
--- a/tests/integ/modin/conftest.py
+++ b/tests/integ/modin/conftest.py
@@ -45,6 +45,7 @@ def pytest_addoption(parser):
     parser.addoption(
         "--generate_pandas_api_coverage", action="store_true", default=False
     )
+    parser.addoption("--skip_sql_count_check", action="store_true", default=False)


 @pytest.fixture(scope="session", autouse=True)
@@ -54,6 +55,17 @@ def setup_pandas_api_coverage_generator(pytestconfig):
         PandasAPICoverageGenerator()


+SKIP_SQL_COUNT_CHECK = False
+
+
+@pytest.fixture(scope="session", autouse=True)
+def setup_skip_sql_count_check(pytestconfig):
+    skip = pytestconfig.getoption("skip_sql_count_check")
+    if skip:
+        global SKIP_SQL_COUNT_CHECK
+        SKIP_SQL_COUNT_CHECK = True
+
+
 @pytest.fixture(scope="function")
 def sql_counter():
     """Return a sql counter as pytest fixture"""
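The conftest change relies on a module-level flag that a session-scoped fixture mutates after import time, which is why SqlCounter.__init__ in the diff below imports the flag at call time rather than at module top level. A runnable sketch of that pattern, with all names hypothetical:

    import sys
    import types

    # Stand-in for tests/integ/modin/conftest.py.
    conf = types.ModuleType("conftest_demo")
    conf.SKIP_CHECK = False
    sys.modules["conftest_demo"] = conf

    def should_check() -> bool:
        # Importing inside the function reads the module attribute as it is
        # now; a top-level import would have captured the value before any
        # fixture had a chance to flip it.
        from conftest_demo import SKIP_CHECK
        return not SKIP_CHECK

    assert should_check() is True
    conf.SKIP_CHECK = True  # what the autouse session fixture effectively does
    assert should_check() is False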
diff --git a/tests/integ/modin/sql_counter.py b/tests/integ/modin/sql_counter.py
index f3f12d031ed..8de38b8c8d5 100644
--- a/tests/integ/modin/sql_counter.py
+++ b/tests/integ/modin/sql_counter.py
@@ -119,8 +119,14 @@ def __init__(
         high_count_reason=None,
         **kwargs,
     ) -> "SqlCounter":
+        from tests.integ.modin.conftest import SKIP_SQL_COUNT_CHECK
+
         self._queries: list[QueryRecord] = []
-        self._no_check = no_check
+
+        # Bypassing sql counter since
+        # 1. it is an unnecessary metric for tests running in stored procedures
+        # 2. pytest-assume package is not available in conda
+        self._no_check = no_check or IS_IN_STORED_PROC or SKIP_SQL_COUNT_CHECK

         # Save any expected sql counts initialized at start up.
         self._expected_sql_counts = {}
@@ -357,12 +363,6 @@ def sql_count_checker(
     *args,
     **kwargs,
 ):
-    # Bypassing sql counter since
-    # 1. it is an unnecessary metric for tests running in stored procedures
-    # 2. pytest-assume package is not available in conda
-    if IS_IN_STORED_PROC:
-        return
-
     """SqlCounter decorator that automatically validates the sql counts when test finishes."""
     sql_counter = SqlCounter(
         no_check=no_check,