From 3ce073a47be83e25aa568bed4f44fe0150e6ba9b Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jun 2024 16:02:05 -0700 Subject: [PATCH 01/20] [SNOW-1502893]: Add support for `pd.crosstab` --- CHANGELOG.md | 1 + docs/source/modin/general_functions.rst | 1 + .../modin/supported/general_supported.rst | 2 +- .../snowpark/modin/pandas/general.py | 145 ++++++++++++++++-- 4 files changed, 132 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d34d53844a..7de54de7a88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -70,6 +70,7 @@ - Added support for `Series.at`, `Series.iat`, `DataFrame.at`, and `DataFrame.iat`. - Added support for `Series.dt.isocalendar`. - Added support for `Series.case_when` except when condition or replacement is callable. +- Added support for `pd.crosstab`. #### Bug Fixes diff --git a/docs/source/modin/general_functions.rst b/docs/source/modin/general_functions.rst index 737033e2190..40b6440473b 100644 --- a/docs/source/modin/general_functions.rst +++ b/docs/source/modin/general_functions.rst @@ -11,6 +11,7 @@ General functions :toctree: pandas_api/ melt + crosstab pivot_table cut qcut diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst index 32c1142bb70..899c3c0e93f 100644 --- a/docs/source/modin/supported/general_supported.rst +++ b/docs/source/modin/supported/general_supported.rst @@ -18,7 +18,7 @@ Data manipulations | ``concat`` | P | ``levels`` is not supported, | | | | | ``copy`` is ignored | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``crosstab`` | N | | | +| ``crosstab`` | Y | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``cut`` | P | ``retbins``, ``labels`` | ``N`` if ``retbins=True``or ``labels!=False`` | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index bf0667e6365..8388c381b9c 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -22,7 +22,7 @@ """Implement pandas general API.""" from __future__ import annotations -from collections.abc import Hashable, Iterable, Mapping, Sequence +from collections.abc import Callable, Hashable, Iterable, Mapping, Sequence from datetime import date, datetime, tzinfo from logging import getLogger from typing import TYPE_CHECKING, Any, Literal, Union @@ -47,7 +47,7 @@ _infer_tz_from_endpoints, _maybe_normalize_endpoints, ) -from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.common import is_list_like, is_nested_list_like from pandas.core.dtypes.inference import is_array_like from pandas.core.tools.datetimes import ( ArrayConvertible, @@ -1769,7 +1769,6 @@ def melt( @snowpark_pandas_telemetry_standalone_function_decorator -@pandas_module_level_function_not_implemented() @_inherit_docstrings(pandas.crosstab, apilink="pandas.crosstab") def crosstab( index, @@ -1786,20 +1785,134 @@ def crosstab( """ Compute a simple cross tabulation of two (or more) factors. """ - # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py - pandas_crosstab = pandas.crosstab( - index, - columns, - values, - rownames, - colnames, - aggfunc, - margins, - margins_name, - dropna, - normalize, + if values is None and aggfunc is not None: + raise ValueError("aggfunc cannot be used without values.") + + if values is not None and aggfunc is None: + raise ValueError("values cannot be used without an aggfunc.") + + if not is_nested_list_like(index): + index = [index] + if not is_nested_list_like(columns): + columns = [columns] + + from pandas.core.reshape.pivot import _build_names_mapper, _get_names + + rownames = _get_names(index, rownames, prefix="row") + colnames = _get_names(columns, colnames, prefix="col") + + ( + rownames_mapper, + unique_rownames, + colnames_mapper, + unique_colnames, + ) = _build_names_mapper(rownames, colnames) + + common_idx = None + pass_objs = [x for x in index + columns if isinstance(x, (Series, DataFrame))] + if pass_objs: + if len(pass_objs) == 1: + common_idx = pass_objs[0] + else: + common_idx = pass_objs[0].intersection(pass_objs[1:]) + + data = { + **dict(zip(unique_rownames, index)), + **dict(zip(unique_colnames, columns)), + } + df = DataFrame(data, index=common_idx) + + if values is None: + df["__dummy__"] = 0 + kwargs = {"aggfunc": "count"} + else: + df["__dummy__"] = values + kwargs = {"aggfunc": aggfunc} + + table = df.pivot_table( + "__dummy__", + index=unique_rownames, + columns=unique_colnames, + margins=margins, + margins_name=margins_name, + dropna=dropna, + # observed=dropna, + **kwargs, # type: ignore[arg-type] ) - return DataFrame(pandas_crosstab) + + if aggfunc is None: + # If no aggfunc is provided, we are computing frequencies. Since we use + # pivot_table above, pairs that are not observed will get a NaN value, + # so we need to fill all NaN values with 0. + table = table.fillna(0) + + # We must explicitly check that the value of normalize is not False here, + # as a valid value of normalize is `0` (for normalizing index). + if normalize is not False: + if normalize not in [0, 1, "index", "columns", "all", True]: + raise ValueError(f"Not a valid normalize argument: {normalize}") + if normalize is True: + normalize = "all" + normalize = {0: "index", 1: "columns"}.get(normalize, normalize) + + # Actual Normalizations + normalizers: dict[bool | str, Callable] = { + "all": lambda x: x / x.sum(axis=0).sum(), + "columns": lambda x: x / x.sum(), + "index": lambda x: x.div(x.sum(axis=1), axis=0), + } + + if margins is False: + + f = normalizers[normalize] + + table = f(table) + table = table.fillna(0) + else: + # keep index and column of pivoted table + table_index = table.index + table_columns = table.columns + + column_margin = table.iloc[:-1, -1] + index_margin = table.iloc[-1, :-1] + + # keep the core table + table = table.iloc[:-1, :-1] + + # Normalize core + f = normalizers[normalize] + + table = f(table) + table = table.fillna(0) + + # Fix Margins + if normalize == "columns": + column_margin = column_margin / column_margin.sum() + table = pd.concat([table, column_margin], axis=1) + table = table.fillna(0) + table.columns = table_columns + + elif normalize == "index": + index_margin = index_margin / index_margin.sum() + table = table.append(index_margin, ignore_index=True) + table = table.fillna(0) + table.index = table_index + + elif normalize == "all": + column_margin = column_margin / column_margin.sum() + index_margin = index_margin / index_margin.sum() + index_margin.loc[margins_name] = 1 + table = pd.concat([table, column_margin], axis=1) + table = table.append(index_margin, ignore_index=True) + + table = table.fillna(0) + table.index = table_index + table.columns = table_columns + + table = table.rename_axis(index=rownames_mapper, axis=0) + table = table.rename_axis(columns=colnames_mapper, axis=1) + + return table # Adding docstring since pandas docs don't have web section for this function. From d9e5b798ddf06b5065f2e19467267fa168610463 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 13 Aug 2024 10:55:52 -0700 Subject: [PATCH 02/20] Add initial tests + implementation --- src/snowflake/snowpark/modin/pandas/base.py | 1 - .../snowpark/modin/pandas/general.py | 3 +- .../compiler/snowflake_query_compiler.py | 2 +- tests/integ/modin/crosstab/__init__.py | 3 + tests/integ/modin/crosstab/test_crosstab.py | 124 ++++++++++++++++++ 5 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 tests/integ/modin/crosstab/__init__.py create mode 100644 tests/integ/modin/crosstab/test_crosstab.py diff --git a/src/snowflake/snowpark/modin/pandas/base.py b/src/snowflake/snowpark/modin/pandas/base.py index 43ed5ee389f..486fb8296ee 100644 --- a/src/snowflake/snowpark/modin/pandas/base.py +++ b/src/snowflake/snowpark/modin/pandas/base.py @@ -804,7 +804,6 @@ def aggregate( # TypeError: got an unexpected keyword argument 'skipna' if is_dict_like(func) and not uses_named_kwargs: kwargs.clear() - result = self.__constructor__( query_compiler=self._query_compiler.agg( func=func, diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 2ae9734fc8a..ece3de5b6fb 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -1927,8 +1927,9 @@ def crosstab( if margins is False: f = normalizers[normalize] - + names = table.columns.names table = f(table) + table.columns.names = names table = table.fillna(0) else: # keep index and column of pivoted table diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 37e04e3643b..00ee487e5bc 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -13863,7 +13863,7 @@ def infer_sorted_column_labels( new_frame = InternalFrame.create( ordered_dataframe=expanded_ordered_frame, data_column_pandas_labels=expanded_data_column_pandas_labels, - data_column_pandas_index_names=[None], # operation removes names + data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names, data_column_snowflake_quoted_identifiers=expanded_data_column_snowflake_quoted_identifiers, index_column_pandas_labels=index_column_pandas_labels, index_column_snowflake_quoted_identifiers=frame.index_column_snowflake_quoted_identifiers, diff --git a/tests/integ/modin/crosstab/__init__.py b/tests/integ/modin/crosstab/__init__.py new file mode 100644 index 00000000000..0fbef920926 --- /dev/null +++ b/tests/integ/modin/crosstab/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py new file mode 100644 index 00000000000..cfa254ef989 --- /dev/null +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -0,0 +1,124 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.utils import assert_snowpark_pandas_equal_to_pandas + + +@sql_count_checker(query_count=4, join_count=1) +def test_basic_crosstab(): + a = np.array( + ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], + dtype=object, + ) + b = np.array( + ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + native_df = native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) + snow_df = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + + +@sql_count_checker(query_count=4, join_count=4, union_count=3) +def test_basic_crosstab_margins(): + a = np.array( + ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], + dtype=object, + ) + b = np.array( + ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + native_df = native_pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + margins=True, + margins_name="MARGINS_NAME", + ) + snow_df = pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + margins=True, + margins_name="MARGINS_NAME", + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + + +@sql_count_checker(query_count=5, join_count=1) +@pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) +def test_basic_crosstab_normalize(normalize): + a = np.array( + ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], + dtype=object, + ) + b = np.array( + ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + native_df = native_pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize + ) + snow_df = pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) From b883db730e9fb385fd1d788df36d007ef5418d3b Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 13 Aug 2024 12:02:28 -0700 Subject: [PATCH 03/20] Add values tests --- .../snowpark/modin/pandas/general.py | 4 +- tests/integ/modin/crosstab/test_crosstab.py | 119 +++++++++++++++++- 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index ece3de5b6fb..1fca0aa5e94 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -1876,7 +1876,9 @@ def crosstab( if len(pass_objs) == 1: common_idx = pass_objs[0] else: - common_idx = pass_objs[0].intersection(pass_objs[1:]) + common_idx = pass_objs[0].index.intersection( + [obj.index for obj in pass_objs[1:]] + ) data = { **dict(zip(unique_rownames, index)), diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index cfa254ef989..93c40c2691c 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -44,7 +44,7 @@ def test_basic_crosstab(): @sql_count_checker(query_count=4, join_count=4, union_count=3) -def test_basic_crosstab_margins(): +def test_margins(): a = np.array( ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], dtype=object, @@ -90,7 +90,7 @@ def test_basic_crosstab_margins(): @sql_count_checker(query_count=5, join_count=1) @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) -def test_basic_crosstab_normalize(normalize): +def test_normalize(normalize): a = np.array( ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], dtype=object, @@ -122,3 +122,118 @@ def test_basic_crosstab_normalize(normalize): a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize ) assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + + +@sql_count_checker(query_count=6, join_count=12, union_count=8) +@pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) +def test_normalize_and_margins(normalize): + a = np.array( + ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], + dtype=object, + ) + b = np.array( + ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + native_df = native_pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + normalize=normalize, + margins=True, + ) + snow_df = pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + normalize=normalize, + margins=True, + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + + +@sql_count_checker(query_count=4, join_count=7) +@pytest.mark.parametrize("aggfunc", ["mean", "sum"]) +def test_values(aggfunc): + native_df = native_pd.DataFrame( + { + "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], + "favorite_food": [ + "chicken", + "fish", + "fish", + "beef", + "chicken", + "beef", + "fish", + "beef", + ], + "age": [7, 2, 8, 5, 9, 3, 6, 1], + } + ) + native_df_result = native_pd.crosstab( + native_df["species"].values, + native_df["favorite_food"].values, + values=native_df["age"].values, + aggfunc=aggfunc, + ) + snow_df = pd.crosstab( + native_df["species"].values, + native_df["favorite_food"].values, + values=native_df["age"].values, + aggfunc=aggfunc, + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df_result) + + +@sql_count_checker(query_count=4, join_count=7) +@pytest.mark.parametrize("aggfunc", ["mean", "sum"]) +def test_values_series_like(aggfunc): + native_df = native_pd.DataFrame( + { + "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], + "favorite_food": [ + "chicken", + "fish", + "fish", + "beef", + "chicken", + "beef", + "fish", + "beef", + ], + "age": [7, 2, 8, 5, 9, 3, 6, 1], + } + ) + snow_df = pd.DataFrame(native_df) + native_df = native_pd.crosstab( + native_df["species"], + native_df["favorite_food"], + values=native_df["age"], + aggfunc=aggfunc, + ) + snow_df = pd.crosstab( + snow_df["species"], + snow_df["favorite_food"], + values=snow_df["age"], + aggfunc=aggfunc, + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) From 794b592bbf063e12b36d71a194a9e3183d9c7bbe Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 20 Aug 2024 14:55:28 -0700 Subject: [PATCH 04/20] Add more support --- .../snowpark/modin/pandas/general.py | 124 +++++++-- .../compiler/snowflake_query_compiler.py | 13 +- tests/integ/modin/crosstab/test_crosstab.py | 259 +++++++++++++++++- 3 files changed, 372 insertions(+), 24 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 1fca0aa5e94..51787b54d53 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -1858,10 +1858,28 @@ def crosstab( if not is_nested_list_like(columns): columns = [columns] + user_passed_rownames = rownames is not None + user_passed_colnames = colnames is not None + from pandas.core.reshape.pivot import _build_names_mapper, _get_names - rownames = _get_names(index, rownames, prefix="row") - colnames = _get_names(columns, colnames, prefix="col") + def _get_names_wrapper(list_of_objs, names, prefix): + """ + Helper method to expand DataFrame objects containing + multiple columns into Series, since `_get_names` expects + one column per entry. + """ + expanded_list_of_objs = [] + for obj in list_of_objs: + if isinstance(obj, DataFrame): + for col in obj.columns: + expanded_list_of_objs.append(obj[col]) + else: + expanded_list_of_objs.append(obj) + return _get_names(expanded_list_of_objs, names, prefix) + + rownames = _get_names_wrapper(index, rownames, prefix="row") + colnames = _get_names_wrapper(columns, colnames, prefix="col") ( rownames_mapper, @@ -1870,21 +1888,89 @@ def crosstab( unique_colnames, ) = _build_names_mapper(rownames, colnames) - common_idx = None pass_objs = [x for x in index + columns if isinstance(x, (Series, DataFrame))] + row_idx_names = None + col_idx_names = None if pass_objs: - if len(pass_objs) == 1: - common_idx = pass_objs[0] - else: - common_idx = pass_objs[0].index.intersection( - [obj.index for obj in pass_objs[1:]] - ) - - data = { - **dict(zip(unique_rownames, index)), - **dict(zip(unique_colnames, columns)), - } - df = DataFrame(data, index=common_idx) + # If we have any Snowpark pandas objects in the index or columns, then we + # need to find the intersection of their indices, and only pick rows from + # the objects that have indices in the intersection of their indices. + # After we do that, we then need to append the non Snowpark pandas objects + # using the intersection of indices as the final index for the DataFrame object. + # First, we separate the objects into Snowpark pandas objects, and non-Snowpark + # pandas objects (while renaming them so that they have unique names). + rownames_idx = 0 + row_idx_names = [] + dfs = [] + arrays = [] + for obj in index: + if isinstance(obj, Series): + row_idx_names.append(obj.name) + df = pd.DataFrame(obj) + df.columns = [unique_rownames[rownames_idx]] + rownames_idx += 1 + dfs.append(df) + elif isinstance(obj, DataFrame): + row_idx_names.extend(obj.columns) + obj.columns = unique_rownames[ + rownames_idx : rownames_idx + len(obj.columns) + ] + rownames_idx += len(obj.columns) + dfs.append(obj) + else: + row_idx_names.append(None) + df = pd.DataFrame(obj) + df.columns = unique_rownames[ + rownames_idx : rownames_idx + len(df.columns) + ] + rownames_idx += len(df.columns) + arrays.append(df) + + colnames_idx = 0 + col_idx_names = [] + for obj in columns: + if isinstance(obj, Series): + col_idx_names.append(obj.name) + df = pd.DataFrame(obj) + df.columns = [unique_colnames[colnames_idx]] + colnames_idx += 1 + dfs.append(df) + elif isinstance(obj, DataFrame): + col_idx_names.extend(obj.columns) + obj.columns = unique_colnames[ + colnames_idx : colnames_idx + len(obj.columns) + ] + colnames_idx += len(obj.columns) + dfs.append(obj) + else: + col_idx_names.append(None) + df = pd.DataFrame(obj) + df.columns = unique_colnames[ + colnames_idx : colnames_idx + len(df.columns) + ] + colnames_idx += len(df.columns) + arrays.append(df) + + # Now, we have two lists - a list of Snowpark pandas objects, and a list of objects + # that were not passed in as Snowpark pandas objects, but that we have converted + # to Snowpark pandas objects to give them column names. We can perform inner joins + # on the dfs list to get a DataFrame with the final index (that is only an intersection + # of indices.) + df = dfs[0] + for right in dfs[1:]: + df = df.merge(right, left_index=True, right_index=True) + + if len(arrays) > 0: + index = df.index + right_df = pd.concat(arrays, axis=1) + right_df.index = index + df = df.merge(right_df, left_index=True, right_index=True) + else: + data = { + **dict(zip(unique_rownames, index)), + **dict(zip(unique_colnames, columns)), + } + df = DataFrame(data) if values is None: df["__dummy__"] = 0 @@ -1904,6 +1990,12 @@ def crosstab( **kwargs, # type: ignore[arg-type] ) + if row_idx_names is not None and not user_passed_rownames: + table.index = table.index.set_names(row_idx_names) + + if col_idx_names is not None and not user_passed_colnames: + table.columns = table.columns.set_names(col_idx_names) + if aggfunc is None: # If no aggfunc is provided, we are computing frequencies. Since we use # pivot_table above, pairs that are not observed will get a NaN value, @@ -1923,7 +2015,7 @@ def crosstab( normalizers: dict[bool | str, Callable] = { "all": lambda x: x / x.sum(axis=0).sum(), "columns": lambda x: x / x.sum(), - "index": lambda x: x.div(x.sum(axis=1), axis=0), + "index": lambda x: x.div(x.sum(axis=1)), } if margins is False: diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 00ee487e5bc..27f3d88c536 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5272,11 +5272,14 @@ def agg( ) for agg_arg in agg_args } + pandas_labels = list(agg_col_map.keys()) + if self.is_multiindex(axis=1): + pandas_labels = [ + (label,) * len(self.columns.names) for label in pandas_labels + ] single_agg_func_query_compilers.append( SnowflakeQueryCompiler( - frame.project_columns( - list(agg_col_map.keys()), list(agg_col_map.values()) - ) + frame.project_columns(pandas_labels, list(agg_col_map.values())) ) ) else: # axis == 0 @@ -13816,9 +13819,7 @@ def infer_sorted_column_labels( new_frame = InternalFrame.create( ordered_dataframe=expanded_ordered_frame, data_column_pandas_labels=sorted_column_labels, - data_column_pandas_index_names=[ - None - ], # operation removes column index name always. + data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names, data_column_snowflake_quoted_identifiers=frame.data_column_snowflake_quoted_identifiers + new_identifiers, index_column_pandas_labels=index_column_pandas_labels, diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index 93c40c2691c..9ab416e0e14 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -2,6 +2,8 @@ # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. # +import re + import modin.pandas as pd import numpy as np import pandas as native_pd @@ -9,7 +11,10 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.sql_counter import sql_count_checker -from tests.integ.modin.utils import assert_snowpark_pandas_equal_to_pandas +from tests.integ.modin.utils import ( + assert_snowpark_pandas_equal_to_pandas, + eval_snowpark_pandas_result, +) @sql_count_checker(query_count=4, join_count=1) @@ -43,6 +48,251 @@ def test_basic_crosstab(): assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) +@sql_count_checker(query_count=6, join_count=14) +def test_basic_crosstab_with_series_objs_full_overlap(): + a = np.array( + ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], + dtype=object, + ) + b = native_pd.Series( + ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], + ) + c = native_pd.Series( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + ) + native_df = native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) + snow_df = pd.crosstab( + a, [pd.Series(b), pd.Series(c)], rownames=["a"], colnames=["b", "c"] + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + + +@sql_count_checker(query_count=6, join_count=14) +def test_basic_crosstab_with_series_objs_some_overlap(): + a = np.array( + ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], + dtype=object, + ) + b = native_pd.Series( + ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], + index=list(range(len(a))), + ) + c = native_pd.Series( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + index=-1 * np.array(list(range(len(a)))), + ) + + # All columns have to be the same length (if NumPy arrays are present, then + # pandas errors if they do not match the length of the other Series after + # they are joined (i.e. filtered so that their indices are the same)). In + # this test, we truncate the numpy column so that the lengths are correct. + def eval_func(args_list): + a, b, c = args_list + if isinstance(b, native_pd.Series): + return native_pd.crosstab( + a[:1], [b, c], rownames=["a"], colnames=["b", "c"] + ) + else: + return pd.crosstab(a[:1], [b, c], rownames=["a"], colnames=["b", "c"]) + + native_args = [a, b, c] + snow_args = [a, pd.Series(b), pd.Series(c)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + ) + + +@sql_count_checker(query_count=2, join_count=1) +def test_basic_crosstab_with_series_objs_some_overlap_error(): + a = np.array( + ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], + dtype=object, + ) + b = native_pd.Series( + ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], + index=list(range(len(a))), + ) + c = native_pd.Series( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + index=-1 * np.array(list(range(len(a)))), + ) + + # All columns have to be the same length (if NumPy arrays are present, then + # pandas errors if they do not match the length of the other Series after + # they are joined (i.e. filtered so that their indices are the same)) + def eval_func(args_list): + a, b, c = args_list + if isinstance(b, native_pd.Series): + return native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) + else: + return pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) + + native_args = [a, b, c] + snow_args = [a, pd.Series(b), pd.Series(c)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + expect_exception=True, + expect_exception_match=re.escape( + "Length mismatch: Expected 11 rows, received array of length 1" + ), + expect_exception_type=ValueError, + assert_exception_equal=False, # Our error message is a little different. + ) + + +@sql_count_checker(query_count=2, join_count=1) +def test_basic_crosstab_with_series_objs_no_overlap_error(): + a = np.array( + ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], + dtype=object, + ) + b = native_pd.Series( + ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], + index=list(range(len(a))), + ) + c = native_pd.Series( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + index=-1 - np.array(list(range(len(a)))), + ) + + # All columns have to be the same length (if NumPy arrays are present, then + # pandas errors if they do not match the length of the other Series after + # they are joined (i.e. filtered so that their indices are the same)) + def eval_func(args_list): + a, b, c = args_list + if isinstance(b, native_pd.Series): + return native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) + else: + return pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) + + native_args = [a, b, c] + snow_args = [a, pd.Series(b), pd.Series(c)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + expect_exception=True, + expect_exception_match=re.escape( + "Length mismatch: Expected 11 rows, received array of length 0" + ), + expect_exception_type=ValueError, + assert_exception_equal=False, # Our error message is a little different. + ) + + +@sql_count_checker(query_count=7, join_count=4) +def test_basic_crosstab_with_df_and_series_objs_pandas_errors(): + a = native_pd.Series( + ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], + dtype=object, + ) + b = native_pd.DataFrame( + { + "0": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "1": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + } + ) + # pandas expects only Series objects, or DataFrames that have only a single column, while + # we support accepting DataFrames with multiple columns. + with pytest.raises( + AssertionError, match="arrays and names must have the same length" + ): + native_pd.crosstab(a, b, rownames=["a"], colnames=["b", "c"]) + + def eval_func(args_list): + a, b = args_list + if isinstance(a, native_pd.Series): + return native_pd.crosstab( + a, [b[c] for c in b.columns], rownames=["a"], colnames=["b", "c"] + ) + else: + return pd.crosstab(a, b, rownames=["a"], colnames=["b", "c"]) + + native_args = [a, b] + snow_args = [pd.Series(a), pd.DataFrame(b)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + ) + + @sql_count_checker(query_count=4, join_count=4, union_count=3) def test_margins(): a = np.array( @@ -204,7 +454,12 @@ def test_values(aggfunc): assert_snowpark_pandas_equal_to_pandas(snow_df, native_df_result) -@sql_count_checker(query_count=4, join_count=7) +@sql_count_checker( + query_count=11, + join_count=14, + high_count_expected=True, + high_count_reason="Need to perform multiple joins followed by a pivot_table, which itself includes multiple joins.", +) @pytest.mark.parametrize("aggfunc", ["mean", "sum"]) def test_values_series_like(aggfunc): native_df = native_pd.DataFrame( From f76a0341267ea5222673661c1e3ef8f37724930e Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 20 Aug 2024 18:27:45 -0700 Subject: [PATCH 05/20] Add support for everything but normalize + margins together --- .../snowpark/modin/pandas/general.py | 2 +- .../compiler/snowflake_query_compiler.py | 30 +++++++++++++- tests/integ/modin/crosstab/test_crosstab.py | 40 +++++++++---------- 3 files changed, 49 insertions(+), 23 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 51787b54d53..0d0bf4cdeb6 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -2015,7 +2015,7 @@ def _get_names_wrapper(list_of_objs, names, prefix): normalizers: dict[bool | str, Callable] = { "all": lambda x: x / x.sum(axis=0).sum(), "columns": lambda x: x / x.sum(), - "index": lambda x: x.div(x.sum(axis=1)), + "index": lambda x: x.div(x.sum(axis=1), axis="index"), } if margins is False: diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 13c524f7762..4e89dbdd7e9 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -13673,7 +13673,6 @@ def create_lazy_type_functions( assert len(right_result_data_identifiers) == 1, "other must be a Series" right = right_result_data_identifiers[0] right_datatype = right_datatypes[0] - # now replace in result frame identifiers with binary op result update_result = joined_frame.result_frame.update_snowflake_quoted_identifiers_with_expressions( { @@ -13691,6 +13690,7 @@ def create_lazy_type_functions( identifiers_to_keep = set( new_frame.index_column_snowflake_quoted_identifiers ) | set(update_result.old_id_to_new_id_mappings.values()) + self_is_column_mi = len(self._modin_frame.data_column_pandas_index_names) label_to_snowflake_quoted_identifier = tuple( filter( lambda pair: pair.snowflake_quoted_identifier in identifiers_to_keep, @@ -13698,11 +13698,37 @@ def create_lazy_type_functions( ) ) + first_column_label = label_to_snowflake_quoted_identifier[ + new_frame.num_index_columns + ].label + + if ( + self_is_column_mi + and isinstance(first_column_label, tuple) + and isinstance(first_column_label[0], tuple) + ): + from snowflake.snowpark.modin.plugin._internal.frame import ( + LabelIdentifierPair, + ) + + new_label_to_snowflake_quoted_identifier = list( + label_to_snowflake_quoted_identifier[: new_frame.num_index_columns] + ) + for pair in label_to_snowflake_quoted_identifier[ + new_frame.num_index_columns : + ]: + new_label_to_snowflake_quoted_identifier.append( + LabelIdentifierPair(pair.label[0], pair.snowflake_quoted_identifier) + ) + label_to_snowflake_quoted_identifier = tuple( + new_label_to_snowflake_quoted_identifier + ) + new_frame = InternalFrame( ordered_dataframe=new_frame.ordered_dataframe, label_to_snowflake_quoted_identifier=label_to_snowflake_quoted_identifier, num_index_columns=new_frame.num_index_columns, - data_column_index_names=new_frame.data_column_index_names, + data_column_index_names=self._modin_frame.data_column_index_names, snowflake_quoted_identifier_to_snowpark_pandas_type={ pair.snowflake_quoted_identifier: None for pair in label_to_snowflake_quoted_identifier diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index 9ab416e0e14..3d6bd149b24 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -10,14 +10,14 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equal_to_pandas, eval_snowpark_pandas_result, ) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=3) def test_basic_crosstab(): a = np.array( ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], @@ -48,7 +48,7 @@ def test_basic_crosstab(): assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) -@sql_count_checker(query_count=6, join_count=14) +@sql_count_checker(query_count=5, join_count=13) def test_basic_crosstab_with_series_objs_full_overlap(): a = np.array( ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], @@ -79,7 +79,7 @@ def test_basic_crosstab_with_series_objs_full_overlap(): assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) -@sql_count_checker(query_count=6, join_count=14) +@sql_count_checker(query_count=5, join_count=13) def test_basic_crosstab_with_series_objs_some_overlap(): a = np.array( ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], @@ -232,7 +232,7 @@ def eval_func(args_list): ) -@sql_count_checker(query_count=7, join_count=4) +@sql_count_checker(query_count=6, join_count=3) def test_basic_crosstab_with_df_and_series_objs_pandas_errors(): a = native_pd.Series( ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], @@ -293,7 +293,7 @@ def eval_func(args_list): ) -@sql_count_checker(query_count=4, join_count=4, union_count=3) +@sql_count_checker(query_count=3, join_count=3, union_count=3) def test_margins(): a = np.array( ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], @@ -338,9 +338,10 @@ def test_margins(): assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) -@sql_count_checker(query_count=5, join_count=1) @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) def test_normalize(normalize): + query_count = 4 if normalize not in (0, "index") else 3 + join_count = 0 if normalize not in (0, "index") else 3 a = np.array( ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], dtype=object, @@ -365,16 +366,17 @@ def test_normalize(normalize): ], dtype=object, ) - native_df = native_pd.crosstab( - a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize - ) - snow_df = pd.crosstab( - a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize - ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + with SqlCounter(query_count=query_count, join_count=join_count): + native_df = native_pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize + ) + snow_df = pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) -@sql_count_checker(query_count=6, join_count=12, union_count=8) +@sql_count_checker(query_count=5, join_count=11, union_count=8) @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) def test_normalize_and_margins(normalize): a = np.array( @@ -420,7 +422,7 @@ def test_normalize_and_margins(normalize): assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) -@sql_count_checker(query_count=4, join_count=7) +@sql_count_checker(query_count=3, join_count=6) @pytest.mark.parametrize("aggfunc", ["mean", "sum"]) def test_values(aggfunc): native_df = native_pd.DataFrame( @@ -455,10 +457,8 @@ def test_values(aggfunc): @sql_count_checker( - query_count=11, - join_count=14, - high_count_expected=True, - high_count_reason="Need to perform multiple joins followed by a pivot_table, which itself includes multiple joins.", + query_count=9, + join_count=10, ) @pytest.mark.parametrize("aggfunc", ["mean", "sum"]) def test_values_series_like(aggfunc): From d9c759fbc657d4074a70e3390697388a858b04b9 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 20 Aug 2024 18:40:17 -0700 Subject: [PATCH 06/20] Add support for normalize + margins on rows --- src/snowflake/snowpark/modin/pandas/general.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 0d0bf4cdeb6..0f320c19c4e 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -2050,8 +2050,8 @@ def _get_names_wrapper(list_of_objs, names, prefix): table.columns = table_columns elif normalize == "index": - index_margin = index_margin / index_margin.sum() - table = table.append(index_margin, ignore_index=True) + index_margin = pd.DataFrame(index_margin / index_margin.sum()).T + table = pd.concat([table, index_margin]).reset_index(drop=True) table = table.fillna(0) table.index = table_index From c84fe32c64c3365f948d0784bac8e967c728cc01 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 21 Aug 2024 16:14:34 -0700 Subject: [PATCH 07/20] Wrap up implementation of crosstab --- .../snowpark/modin/pandas/general.py | 38 +- tests/integ/modin/crosstab/test_crosstab.py | 1147 ++++++++++------- 2 files changed, 729 insertions(+), 456 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 0f320c19c4e..88560936fdf 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -1986,7 +1986,6 @@ def _get_names_wrapper(list_of_objs, names, prefix): margins=margins, margins_name=margins_name, dropna=dropna, - # observed=dropna, **kwargs, # type: ignore[arg-type] ) @@ -2031,36 +2030,37 @@ def _get_names_wrapper(list_of_objs, names, prefix): table_columns = table.columns column_margin = table.iloc[:-1, -1] - index_margin = table.iloc[-1, :-1] - # keep the core table - table = table.iloc[:-1, :-1] - - # Normalize core - f = normalizers[normalize] - - table = f(table) - table = table.fillna(0) - - # Fix Margins if normalize == "columns": + # keep the core table + table = table.iloc[:-1, :-1] + + # Normalize core + f = normalizers[normalize] + table = f(table) + table = table.fillna(0) + # Fix Margins column_margin = column_margin / column_margin.sum() table = pd.concat([table, column_margin], axis=1) table = table.fillna(0) table.columns = table_columns elif normalize == "index": - index_margin = pd.DataFrame(index_margin / index_margin.sum()).T - table = pd.concat([table, index_margin]).reset_index(drop=True) - table = table.fillna(0) - table.index = table_index + table = table.iloc[:, :-1] + + # Normalize core + f = normalizers[normalize] + table = f(table) + table = table.fillna(0).reindex(index=table_index) elif normalize == "all": + # Normalize core + f = normalizers[normalize] + table = f(table.iloc[:, :-1]) * 2.0 + column_margin = column_margin / column_margin.sum() - index_margin = index_margin / index_margin.sum() - index_margin.loc[margins_name] = 1 table = pd.concat([table, column_margin], axis=1) - table = table.append(index_margin, ignore_index=True) + table.iloc[-1, -1] = 1 table = table.fillna(0) table.index = table_index diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index 3d6bd149b24..8dace579af7 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -17,230 +17,89 @@ ) -@sql_count_checker(query_count=3) -def test_basic_crosstab(): - a = np.array( - ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], - dtype=object, - ) - b = np.array( - ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) - native_df = native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) - snow_df = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) - - -@sql_count_checker(query_count=5, join_count=13) -def test_basic_crosstab_with_series_objs_full_overlap(): - a = np.array( - ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], - dtype=object, - ) - b = native_pd.Series( - ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], - ) - c = native_pd.Series( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - ) - native_df = native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) - snow_df = pd.crosstab( - a, [pd.Series(b), pd.Series(c)], rownames=["a"], colnames=["b", "c"] - ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) - - -@sql_count_checker(query_count=5, join_count=13) -def test_basic_crosstab_with_series_objs_some_overlap(): - a = np.array( - ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], - dtype=object, - ) - b = native_pd.Series( - ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], - index=list(range(len(a))), - ) - c = native_pd.Series( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - index=-1 * np.array(list(range(len(a)))), - ) - - # All columns have to be the same length (if NumPy arrays are present, then - # pandas errors if they do not match the length of the other Series after - # they are joined (i.e. filtered so that their indices are the same)). In - # this test, we truncate the numpy column so that the lengths are correct. - def eval_func(args_list): - a, b, c = args_list - if isinstance(b, native_pd.Series): - return native_pd.crosstab( - a[:1], [b, c], rownames=["a"], colnames=["b", "c"] +@pytest.mark.parametrize("dropna", [True, False]) +class TestCrosstab: + def test_basic_crosstab(self, dropna): + query_count = 3 + join_count = 0 if dropna else 3 + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = np.array( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + with SqlCounter(query_count=query_count, join_count=join_count): + native_df = native_pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna ) - else: - return pd.crosstab(a[:1], [b, c], rownames=["a"], colnames=["b", "c"]) - - native_args = [a, b, c] - snow_args = [a, pd.Series(b), pd.Series(c)] - eval_snowpark_pandas_result( - snow_args, - native_args, - eval_func, - ) - - -@sql_count_checker(query_count=2, join_count=1) -def test_basic_crosstab_with_series_objs_some_overlap_error(): - a = np.array( - ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], - dtype=object, - ) - b = native_pd.Series( - ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], - index=list(range(len(a))), - ) - c = native_pd.Series( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - index=-1 * np.array(list(range(len(a)))), - ) - - # All columns have to be the same length (if NumPy arrays are present, then - # pandas errors if they do not match the length of the other Series after - # they are joined (i.e. filtered so that their indices are the same)) - def eval_func(args_list): - a, b, c = args_list - if isinstance(b, native_pd.Series): - return native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) - else: - return pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) - - native_args = [a, b, c] - snow_args = [a, pd.Series(b), pd.Series(c)] - eval_snowpark_pandas_result( - snow_args, - native_args, - eval_func, - expect_exception=True, - expect_exception_match=re.escape( - "Length mismatch: Expected 11 rows, received array of length 1" - ), - expect_exception_type=ValueError, - assert_exception_equal=False, # Our error message is a little different. - ) - - -@sql_count_checker(query_count=2, join_count=1) -def test_basic_crosstab_with_series_objs_no_overlap_error(): - a = np.array( - ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], - dtype=object, - ) - b = native_pd.Series( - ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], - index=list(range(len(a))), - ) - c = native_pd.Series( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - index=-1 - np.array(list(range(len(a)))), - ) - - # All columns have to be the same length (if NumPy arrays are present, then - # pandas errors if they do not match the length of the other Series after - # they are joined (i.e. filtered so that their indices are the same)) - def eval_func(args_list): - a, b, c = args_list - if isinstance(b, native_pd.Series): - return native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) - else: - return pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) - - native_args = [a, b, c] - snow_args = [a, pd.Series(b), pd.Series(c)] - eval_snowpark_pandas_result( - snow_args, - native_args, - eval_func, - expect_exception=True, - expect_exception_match=re.escape( - "Length mismatch: Expected 11 rows, received array of length 0" - ), - expect_exception_type=ValueError, - assert_exception_equal=False, # Our error message is a little different. - ) - - -@sql_count_checker(query_count=6, join_count=3) -def test_basic_crosstab_with_df_and_series_objs_pandas_errors(): - a = native_pd.Series( - ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], - dtype=object, - ) - b = native_pd.DataFrame( - { - "0": [ + snow_df = pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + + def test_basic_crosstab_with_series_objs_full_overlap(self, dropna): + query_count = 5 + join_count = 13 if dropna else 28 + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = native_pd.Series( + [ "one", "one", "one", @@ -253,7 +112,9 @@ def test_basic_crosstab_with_df_and_series_objs_pandas_errors(): "two", "one", ], - "1": [ + ) + c = native_pd.Series( + [ "dull", "dull", "shiny", @@ -266,229 +127,641 @@ def test_basic_crosstab_with_df_and_series_objs_pandas_errors(): "shiny", "shiny", ], - } - ) - # pandas expects only Series objects, or DataFrames that have only a single column, while - # we support accepting DataFrames with multiple columns. - with pytest.raises( - AssertionError, match="arrays and names must have the same length" - ): - native_pd.crosstab(a, b, rownames=["a"], colnames=["b", "c"]) - - def eval_func(args_list): - a, b = args_list - if isinstance(a, native_pd.Series): - return native_pd.crosstab( - a, [b[c] for c in b.columns], rownames=["a"], colnames=["b", "c"] + ) + with SqlCounter(query_count=query_count, join_count=join_count): + native_df = native_pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna ) - else: - return pd.crosstab(a, b, rownames=["a"], colnames=["b", "c"]) - - native_args = [a, b] - snow_args = [pd.Series(a), pd.DataFrame(b)] - eval_snowpark_pandas_result( - snow_args, - native_args, - eval_func, - ) - - -@sql_count_checker(query_count=3, join_count=3, union_count=3) -def test_margins(): - a = np.array( - ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], - dtype=object, - ) - b = np.array( - ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) - native_df = native_pd.crosstab( - a, - [b, c], - rownames=["a"], - colnames=["b", "c"], - margins=True, - margins_name="MARGINS_NAME", - ) - snow_df = pd.crosstab( - a, - [b, c], - rownames=["a"], - colnames=["b", "c"], - margins=True, - margins_name="MARGINS_NAME", - ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + snow_df = pd.crosstab( + a, + [pd.Series(b), pd.Series(c)], + rownames=["a"], + colnames=["b", "c"], + dropna=dropna, + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + + def test_basic_crosstab_with_series_objs_some_overlap(self, dropna): + query_count = 5 + join_count = 13 if dropna else 28 + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = native_pd.Series( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + index=list(range(len(a))), + ) + c = native_pd.Series( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + index=-1 * np.array(list(range(len(a)))), + ) + # All columns have to be the same length (if NumPy arrays are present, then + # pandas errors if they do not match the length of the other Series after + # they are joined (i.e. filtered so that their indices are the same)). In + # this test, we truncate the numpy column so that the lengths are correct. + def eval_func(args_list): + a, b, c = args_list + if isinstance(b, native_pd.Series): + return native_pd.crosstab( + a[:1], [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + else: + return pd.crosstab( + a[:1], [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + + with SqlCounter(query_count=query_count, join_count=join_count): + native_args = [a, b, c] + snow_args = [a, pd.Series(b), pd.Series(c)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + ) -@pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) -def test_normalize(normalize): - query_count = 4 if normalize not in (0, "index") else 3 - join_count = 0 if normalize not in (0, "index") else 3 - a = np.array( - ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], - dtype=object, - ) - b = np.array( - ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) - with SqlCounter(query_count=query_count, join_count=join_count): - native_df = native_pd.crosstab( - a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize + @sql_count_checker(query_count=2, join_count=1) + def test_basic_crosstab_with_series_objs_some_overlap_error(self, dropna): + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = native_pd.Series( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + index=list(range(len(a))), ) - snow_df = pd.crosstab( - a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize + c = native_pd.Series( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + index=-1 * np.array(list(range(len(a)))), ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + # All columns have to be the same length (if NumPy arrays are present, then + # pandas errors if they do not match the length of the other Series after + # they are joined (i.e. filtered so that their indices are the same)) + def eval_func(args_list): + a, b, c = args_list + if isinstance(b, native_pd.Series): + return native_pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + else: + return pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + + native_args = [a, b, c] + snow_args = [a, pd.Series(b), pd.Series(c)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + expect_exception=True, + expect_exception_match=re.escape( + "Length mismatch: Expected 11 rows, received array of length 1" + ), + expect_exception_type=ValueError, + assert_exception_equal=False, # Our error message is a little different. + ) -@sql_count_checker(query_count=5, join_count=11, union_count=8) -@pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) -def test_normalize_and_margins(normalize): - a = np.array( - ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"], - dtype=object, - ) - b = np.array( - ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) - native_df = native_pd.crosstab( - a, - [b, c], - rownames=["a"], - colnames=["b", "c"], - normalize=normalize, - margins=True, - ) - snow_df = pd.crosstab( - a, - [b, c], - rownames=["a"], - colnames=["b", "c"], - normalize=normalize, - margins=True, - ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + @sql_count_checker(query_count=2, join_count=1) + def test_basic_crosstab_with_series_objs_no_overlap_error(self, dropna): + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = native_pd.Series( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + index=list(range(len(a))), + ) + c = native_pd.Series( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + index=-1 - np.array(list(range(len(a)))), + ) + # All columns have to be the same length (if NumPy arrays are present, then + # pandas errors if they do not match the length of the other Series after + # they are joined (i.e. filtered so that their indices are the same)) + def eval_func(args_list): + a, b, c = args_list + if isinstance(b, native_pd.Series): + return native_pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + else: + return pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + + native_args = [a, b, c] + snow_args = [a, pd.Series(b), pd.Series(c)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + expect_exception=True, + expect_exception_match=re.escape( + "Length mismatch: Expected 11 rows, received array of length 0" + ), + expect_exception_type=ValueError, + assert_exception_equal=False, # Our error message is a little different. + ) -@sql_count_checker(query_count=3, join_count=6) -@pytest.mark.parametrize("aggfunc", ["mean", "sum"]) -def test_values(aggfunc): - native_df = native_pd.DataFrame( - { - "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], - "favorite_food": [ - "chicken", - "fish", - "fish", - "beef", - "chicken", - "beef", - "fish", - "beef", + def test_basic_crosstab_with_df_and_series_objs_pandas_errors(self, dropna): + query_count = 6 + join_count = 3 if dropna else 9 + a = native_pd.Series( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", ], - "age": [7, 2, 8, 5, 9, 3, 6, 1], - } - ) - native_df_result = native_pd.crosstab( - native_df["species"].values, - native_df["favorite_food"].values, - values=native_df["age"].values, - aggfunc=aggfunc, - ) - snow_df = pd.crosstab( - native_df["species"].values, - native_df["favorite_food"].values, - values=native_df["age"].values, - aggfunc=aggfunc, - ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df_result) - + dtype=object, + ) + b = native_pd.DataFrame( + { + "0": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "1": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + } + ) + # pandas expects only Series objects, or DataFrames that have only a single column, while + # we support accepting DataFrames with multiple columns. + with pytest.raises( + AssertionError, match="arrays and names must have the same length" + ): + native_pd.crosstab(a, b, rownames=["a"], colnames=["b", "c"], dropna=dropna) + + def eval_func(args_list): + a, b = args_list + if isinstance(a, native_pd.Series): + return native_pd.crosstab( + a, + [b[c] for c in b.columns], + rownames=["a"], + colnames=["b", "c"], + dropna=dropna, + ) + else: + return pd.crosstab( + a, b, rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + + with SqlCounter(query_count=query_count, join_count=join_count): + native_args = [a, b] + snow_args = [pd.Series(a), pd.DataFrame(b)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + ) -@sql_count_checker( - query_count=9, - join_count=10, -) -@pytest.mark.parametrize("aggfunc", ["mean", "sum"]) -def test_values_series_like(aggfunc): - native_df = native_pd.DataFrame( - { - "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], - "favorite_food": [ - "chicken", - "fish", - "fish", - "beef", - "chicken", - "beef", - "fish", - "beef", + def test_margins(self, dropna): + query_count = 3 + join_count = 3 if dropna else 6 + union_count = 3 + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = np.array( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", ], - "age": [7, 2, 8, 5, 9, 3, 6, 1], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + with SqlCounter( + query_count=query_count, join_count=join_count, union_count=union_count + ): + native_df = native_pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + margins=True, + margins_name="MARGINS_NAME", + dropna=dropna, + ) + snow_df = pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + margins=True, + margins_name="MARGINS_NAME", + dropna=dropna, + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + + @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) + def test_normalize(self, dropna, normalize): + query_count = 4 if normalize not in (0, "index") else 3 + join_count = 0 if normalize not in (0, "index") else 3 + if not dropna: + join_count = 9 if normalize in (0, "index") else 4 + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = np.array( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + with SqlCounter(query_count=query_count, join_count=join_count): + native_df = native_pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + normalize=normalize, + dropna=dropna, + ) + snow_df = pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + normalize=normalize, + dropna=dropna, + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + + @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) + def test_normalize_and_margins(self, dropna, normalize): + counts = { + "columns": [5, 11 if dropna else 19, 8], + "index": [3, 15 if dropna else 24, 9], + "all": [7, 41 if dropna else 63, 22], } - ) - snow_df = pd.DataFrame(native_df) - native_df = native_pd.crosstab( - native_df["species"], - native_df["favorite_food"], - values=native_df["age"], - aggfunc=aggfunc, - ) - snow_df = pd.crosstab( - snow_df["species"], - snow_df["favorite_food"], - values=snow_df["age"], - aggfunc=aggfunc, - ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + counts[0] = counts["index"] + counts[1] = counts["columns"] + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = np.array( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + if normalize is True: + sql_counts = counts["all"] + else: + sql_counts = counts[normalize] + with SqlCounter( + query_count=sql_counts[0], + join_count=sql_counts[1], + union_count=sql_counts[2], + ): + native_df = native_pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + normalize=normalize, + margins=True, + dropna=dropna, + ) + snow_df = pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + normalize=normalize, + margins=True, + dropna=dropna, + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + + @pytest.mark.parametrize("aggfunc", ["mean", "sum"]) + def test_values(self, dropna, aggfunc): + query_count = 3 + join_count = 6 if dropna else 15 + native_df = native_pd.DataFrame( + { + "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], + "favorite_food": [ + "chicken", + "fish", + "fish", + "beef", + "chicken", + "beef", + "fish", + "beef", + ], + "age": [7, 2, 8, 5, 9, 3, 6, 1], + } + ) + + with SqlCounter(query_count=query_count, join_count=join_count): + native_df_result = native_pd.crosstab( + native_df["species"].values, + native_df["favorite_food"].values, + values=native_df["age"].values, + aggfunc=aggfunc, + dropna=dropna, + ) + snow_df = pd.crosstab( + native_df["species"].values, + native_df["favorite_food"].values, + values=native_df["age"].values, + aggfunc=aggfunc, + dropna=dropna, + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df_result) + + @pytest.mark.parametrize("aggfunc", ["mean", "sum"]) + def test_values_series_like(self, dropna, aggfunc): + query_count = 9 + join_count = 10 if dropna else 25 + native_df = native_pd.DataFrame( + { + "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], + "favorite_food": [ + "chicken", + "fish", + "fish", + "beef", + "chicken", + "beef", + "fish", + "beef", + ], + "age": [7, 2, 8, 5, 9, 3, 6, 1], + } + ) + snow_df = pd.DataFrame(native_df) + + with SqlCounter(query_count=query_count, join_count=join_count): + native_df = native_pd.crosstab( + native_df["species"], + native_df["favorite_food"], + values=native_df["age"], + aggfunc=aggfunc, + dropna=dropna, + ) + snow_df = pd.crosstab( + snow_df["species"], + snow_df["favorite_food"], + values=snow_df["age"], + aggfunc=aggfunc, + dropna=dropna, + ) + assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) From f668414698a8eef93a60545e55233ae63097a4e3 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 21 Aug 2024 16:24:37 -0700 Subject: [PATCH 08/20] Fix small bug after merge --- .../modin/plugin/compiler/snowflake_query_compiler.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 9c5f5027b6d..da2a001f4e3 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -195,7 +195,10 @@ compute_bin_indices, preprocess_bins_for_cut, ) -from snowflake.snowpark.modin.plugin._internal.frame import InternalFrame +from snowflake.snowpark.modin.plugin._internal.frame import ( + InternalFrame, + LabelIdentifierPair, +) from snowflake.snowpark.modin.plugin._internal.groupby_utils import ( check_is_groupby_supported_by_snowflake, extract_groupby_column_pandas_labels, @@ -13737,7 +13740,9 @@ def create_lazy_type_functions( and isinstance(pair.label, tuple) and isinstance(pair.label[0], tuple) ): - pair.label = pair.label[0] + pair = LabelIdentifierPair( + pair.label[0], pair.snowflake_quoted_identifier + ) label_to_snowflake_quoted_identifier.append(pair) snowflake_quoted_identifier_to_snowpark_pandas_type[ pair.snowflake_quoted_identifier From f2ba2f4f061026ec24ce2782b80a1fac3dd51aff Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 21 Aug 2024 16:54:36 -0700 Subject: [PATCH 09/20] Add docstrings --- .../snowpark/modin/pandas/general.py | 55 ++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 88560936fdf..d89e72eaf24 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -1831,7 +1831,6 @@ def melt( @snowpark_pandas_telemetry_standalone_function_decorator -@_inherit_docstrings(pandas.crosstab, apilink="pandas.crosstab") def crosstab( index, columns, @@ -1846,6 +1845,60 @@ def crosstab( ) -> DataFrame: # noqa: PR01, RT01, D200 """ Compute a simple cross tabulation of two (or more) factors. + + By default, computes a frequency table of the factors unless an array + of values and an aggregation function are passed. + + Parameters + ---------- + index : array-like, Series, or list of arrays/Series + Values to group by in the rows. + columns : array-like, Series, or list of arrays/Series + Values to group by in the columns. + values : array-like, optional + Array of values to aggregate according to the factors. + Requires aggfunc be specified. + rownames : sequence, default None + If passed, must match number of row arrays passed. + colnames : sequence, default None + If passed, must match number of column arrays passed. + aggfunc : function, optional + If specified, requires values be specified as well. + margins : bool, default False + Add row/column margins (subtotals). + margins_name : str, default 'All' + Name of the row/column that will contain the totals when margins is True. + dropna : bool, default True + Do not include columns whose entries are all NaN. + + normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False + Normalize by dividing all values by the sum of values. + + * If passed 'all' or True, will normalize over all values. + * If passed 'index' will normalize over each row. + * If passed 'columns' will normalize over each column. + * If margins is True, will also normalize margin values. + + Returns + ------- + DataFrame + Cross tabulation of the data. + + Examples + -------- + >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", + ... "bar", "bar", "foo", "foo", "foo"], dtype=object) + >>> b = np.array(["one", "one", "one", "two", "one", "one", + ... "one", "two", "two", "two", "one"], dtype=object) + >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", + ... "shiny", "dull", "shiny", "shiny", "shiny"], + ... dtype=object) + >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) # doctest: +NORMALIZE_WHITESPACE + b one two + c dull shiny dull shiny + a + bar 1 2 1 0 + foo 2 2 1 2 """ if values is None and aggfunc is not None: raise ValueError("aggfunc cannot be used without values.") From da12522e18cfb66d93c81b4403f051bd0377c4d7 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 27 Aug 2024 20:01:43 -0700 Subject: [PATCH 10/20] Fix tests, address review comments --- src/snowflake/snowpark/modin/pandas/base.py | 1 + .../snowpark/modin/pandas/general.py | 13 +- tests/integ/modin/crosstab/__init__.py | 3 - tests/integ/modin/crosstab/test_crosstab.py | 316 ++++++++++++------ 4 files changed, 233 insertions(+), 100 deletions(-) delete mode 100644 tests/integ/modin/crosstab/__init__.py diff --git a/src/snowflake/snowpark/modin/pandas/base.py b/src/snowflake/snowpark/modin/pandas/base.py index 9b5bd4d8d4e..c08cdee1386 100644 --- a/src/snowflake/snowpark/modin/pandas/base.py +++ b/src/snowflake/snowpark/modin/pandas/base.py @@ -801,6 +801,7 @@ def aggregate( # TypeError: got an unexpected keyword argument 'skipna' if is_dict_like(func) and not uses_named_kwargs: kwargs.clear() + result = self.__constructor__( query_compiler=self._query_compiler.agg( func=func, diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 3c5f48b6d4f..877ad4578d9 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -2103,6 +2103,7 @@ def _get_names_wrapper(list_of_objs, names, prefix): row_idx_names = [] dfs = [] arrays = [] + array_lengths = [] for obj in index: if isinstance(obj, Series): row_idx_names.append(obj.name) @@ -2119,6 +2120,7 @@ def _get_names_wrapper(list_of_objs, names, prefix): dfs.append(obj) else: row_idx_names.append(None) + array_lengths.append(len(obj)) df = pd.DataFrame(obj) df.columns = unique_rownames[ rownames_idx : rownames_idx + len(df.columns) @@ -2144,6 +2146,7 @@ def _get_names_wrapper(list_of_objs, names, prefix): dfs.append(obj) else: col_idx_names.append(None) + array_lengths.append(len(obj)) df = pd.DataFrame(obj) df.columns = unique_colnames[ colnames_idx : colnames_idx + len(df.columns) @@ -2151,6 +2154,9 @@ def _get_names_wrapper(list_of_objs, names, prefix): colnames_idx += len(df.columns) arrays.append(df) + if len(set(array_lengths)) > 1: + raise ValueError("All arrays must be of the same length") + # Now, we have two lists - a list of Snowpark pandas objects, and a list of objects # that were not passed in as Snowpark pandas objects, but that we have converted # to Snowpark pandas objects to give them column names. We can perform inner joins @@ -2159,10 +2165,15 @@ def _get_names_wrapper(list_of_objs, names, prefix): df = dfs[0] for right in dfs[1:]: df = df.merge(right, left_index=True, right_index=True) - if len(arrays) > 0: index = df.index right_df = pd.concat(arrays, axis=1) + # Increases query count by 1, but necessary for error checking. + index_length = len(df) + if index_length != array_lengths[0]: + raise ValueError( + f"Length mismatch: Expected {array_lengths[0]} rows, received array of length {index_length}" + ) right_df.index = index df = df.merge(right_df, left_index=True, right_index=True) else: diff --git a/tests/integ/modin/crosstab/__init__.py b/tests/integ/modin/crosstab/__init__.py deleted file mode 100644 index 0fbef920926..00000000000 --- a/tests/integ/modin/crosstab/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# -# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. -# diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index 8dace579af7..48fa27bd85b 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -19,9 +19,9 @@ @pytest.mark.parametrize("dropna", [True, False]) class TestCrosstab: - def test_basic_crosstab(self, dropna): - query_count = 3 - join_count = 0 if dropna else 3 + def test_basic_crosstab_with_numpy_arrays(self, dropna): + query_count = 1 + join_count = 0 if dropna else 1 a = np.array( [ "foo", @@ -71,17 +71,79 @@ def test_basic_crosstab(self, dropna): dtype=object, ) with SqlCounter(query_count=query_count, join_count=join_count): - native_df = native_pd.crosstab( - a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ), ) - snow_df = pd.crosstab( - a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + + def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna): + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + ], + dtype=object, + ) + b = np.array( + [ + "one", + "one", + "one", + "two", + "one", + "two", + "two", + "two", + "one", + ], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + with SqlCounter(query_count=0): + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ), + assert_exception_equal=True, + expect_exception=True, + expect_exception_match="All arrays must be of the same length", + expect_exception_type=ValueError, ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + # In these tests, `overlap` refers to the intersection of the indices + # of the Series objects being passed in to crosstab. crosstab takes + # only the intersection of the index objects of all Series when determining + # the final DataFrame to pass into pivot_table, so here, we are testing + # that we follow that behavior. def test_basic_crosstab_with_series_objs_full_overlap(self, dropna): - query_count = 5 - join_count = 13 if dropna else 28 + # In this case, all indexes are identical - hence "full" overlap. + query_count = 2 + join_count = 5 if dropna else 10 a = np.array( [ "foo", @@ -142,8 +204,14 @@ def test_basic_crosstab_with_series_objs_full_overlap(self, dropna): assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) def test_basic_crosstab_with_series_objs_some_overlap(self, dropna): - query_count = 5 - join_count = 13 if dropna else 28 + # In this case, some values are shared across indexes (non-zero intersection), + # hence "some" overlap. + # When a mix of Series and non-Series objects are passed in, the non-Series + # objects are expected to have the same length as the intersection of the indexes + # of the Series objects. This test case passes because we pass in arrays that + # are the length of the intersection rather than the length of each of the Series. + query_count = 2 + join_count = 5 if dropna else 10 a = np.array( [ "foo", @@ -217,8 +285,15 @@ def eval_func(args_list): eval_func, ) - @sql_count_checker(query_count=2, join_count=1) + @sql_count_checker(query_count=1, join_count=1) def test_basic_crosstab_with_series_objs_some_overlap_error(self, dropna): + # Same as above - the intersection of the indexes of the Series objects + # is non-zero, but the indexes are not identical - hence "some" overlap. + # When a mix of Series and non-Series objects are passed in, the non-Series + # objects are expected to have the same length as the intersection of the indexes + # of the Series objects. This test case errors because we pass in arrays that + # are the length of the Series, rather than the length of the intersection of + # the indexes of the Series. a = np.array( [ "foo", @@ -296,8 +371,11 @@ def eval_func(args_list): assert_exception_equal=False, # Our error message is a little different. ) - @sql_count_checker(query_count=2, join_count=1) + @sql_count_checker(query_count=1, join_count=1) def test_basic_crosstab_with_series_objs_no_overlap_error(self, dropna): + # In this case, no values are shared across the indexes - the intersection is an + # empty set - hence "no" overlap. We error here for the same reason as above - the + # arrays passed in should also be empty, but are non-empty. a = np.array( [ "foo", @@ -376,8 +454,8 @@ def eval_func(args_list): ) def test_basic_crosstab_with_df_and_series_objs_pandas_errors(self, dropna): - query_count = 6 - join_count = 3 if dropna else 9 + query_count = 4 + join_count = 1 if dropna else 3 a = native_pd.Series( [ "foo", @@ -456,9 +534,9 @@ def eval_func(args_list): ) def test_margins(self, dropna): - query_count = 3 - join_count = 3 if dropna else 6 - union_count = 3 + query_count = 1 + join_count = 1 if dropna else 2 + union_count = 1 a = np.array( [ "foo", @@ -510,32 +588,26 @@ def test_margins(self, dropna): with SqlCounter( query_count=query_count, join_count=join_count, union_count=union_count ): - native_df = native_pd.crosstab( - a, - [b, c], - rownames=["a"], - colnames=["b", "c"], - margins=True, - margins_name="MARGINS_NAME", - dropna=dropna, - ) - snow_df = pd.crosstab( - a, - [b, c], - rownames=["a"], - colnames=["b", "c"], - margins=True, - margins_name="MARGINS_NAME", - dropna=dropna, + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + margins=True, + margins_name="MARGINS_NAME", + dropna=dropna, + ), ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) def test_normalize(self, dropna, normalize): - query_count = 4 if normalize not in (0, "index") else 3 - join_count = 0 if normalize not in (0, "index") else 3 - if not dropna: - join_count = 9 if normalize in (0, "index") else 4 + query_count = 1 if normalize in (0, "index") else 2 + join_count = 3 if normalize in (0, "index") else 2 + if dropna: + join_count -= 2 a = np.array( [ "foo", @@ -585,30 +657,25 @@ def test_normalize(self, dropna, normalize): dtype=object, ) with SqlCounter(query_count=query_count, join_count=join_count): - native_df = native_pd.crosstab( - a, - [b, c], - rownames=["a"], - colnames=["b", "c"], - normalize=normalize, - dropna=dropna, - ) - snow_df = pd.crosstab( - a, - [b, c], - rownames=["a"], - colnames=["b", "c"], - normalize=normalize, - dropna=dropna, + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + normalize=normalize, + dropna=dropna, + ), ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) def test_normalize_and_margins(self, dropna, normalize): counts = { - "columns": [5, 11 if dropna else 19, 8], - "index": [3, 15 if dropna else 24, 9], - "all": [7, 41 if dropna else 63, 22], + "columns": [3, 5 if dropna else 9, 4], + "index": [1, 5 if dropna else 8, 3], + "all": [3, 12 if dropna else 19, 7], } counts[0] = counts["index"] counts[1] = counts["columns"] @@ -669,30 +736,24 @@ def test_normalize_and_margins(self, dropna, normalize): join_count=sql_counts[1], union_count=sql_counts[2], ): - native_df = native_pd.crosstab( - a, - [b, c], - rownames=["a"], - colnames=["b", "c"], - normalize=normalize, - margins=True, - dropna=dropna, - ) - snow_df = pd.crosstab( - a, - [b, c], - rownames=["a"], - colnames=["b", "c"], - normalize=normalize, - margins=True, - dropna=dropna, + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + normalize=normalize, + margins=True, + dropna=dropna, + ), ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) @pytest.mark.parametrize("aggfunc", ["mean", "sum"]) def test_values(self, dropna, aggfunc): - query_count = 3 - join_count = 6 if dropna else 15 + query_count = 1 + join_count = 2 if dropna else 5 native_df = native_pd.DataFrame( { "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], @@ -711,26 +772,22 @@ def test_values(self, dropna, aggfunc): ) with SqlCounter(query_count=query_count, join_count=join_count): - native_df_result = native_pd.crosstab( - native_df["species"].values, - native_df["favorite_food"].values, - values=native_df["age"].values, - aggfunc=aggfunc, - dropna=dropna, - ) - snow_df = pd.crosstab( - native_df["species"].values, - native_df["favorite_food"].values, - values=native_df["age"].values, - aggfunc=aggfunc, - dropna=dropna, + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab( + native_df["species"].values, + native_df["favorite_food"].values, + values=native_df["age"].values, + aggfunc=aggfunc, + dropna=dropna, + ), ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df_result) @pytest.mark.parametrize("aggfunc", ["mean", "sum"]) def test_values_series_like(self, dropna, aggfunc): - query_count = 9 - join_count = 10 if dropna else 25 + query_count = 5 + join_count = 2 if dropna else 5 native_df = native_pd.DataFrame( { "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], @@ -765,3 +822,70 @@ def test_values_series_like(self, dropna, aggfunc): dropna=dropna, ) assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + + +@sql_count_checker(query_count=0) +def test_values_unsupported_aggfunc(): + native_df = native_pd.DataFrame( + { + "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], + "favorite_food": [ + "chicken", + "fish", + "fish", + "beef", + "chicken", + "beef", + "fish", + "beef", + ], + "age": [7, 2, 8, 5, 9, 3, 6, 1], + } + ) + + with pytest.raises( + NotImplementedError, + match="Snowpark pandas DataFrame.pivot_table does not yet support the aggregation 'median' with the given arguments.", + ): + pd.crosstab( + native_df["species"].values, + native_df["favorite_food"].values, + values=native_df["age"].values, + aggfunc="median", + dropna=False, + ) + + +@sql_count_checker(query_count=4) +def test_values_series_like_unsupported_aggfunc(): + # The query count above comes from building the DataFrame + # that we pass in to pivot table. + native_df = native_pd.DataFrame( + { + "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], + "favorite_food": [ + "chicken", + "fish", + "fish", + "beef", + "chicken", + "beef", + "fish", + "beef", + ], + "age": [7, 2, 8, 5, 9, 3, 6, 1], + } + ) + snow_df = pd.DataFrame(native_df) + + with pytest.raises( + NotImplementedError, + match="Snowpark pandas DataFrame.pivot_table does not yet support the aggregation 'median' with the given arguments.", + ): + snow_df = pd.crosstab( + snow_df["species"], + snow_df["favorite_food"], + values=snow_df["age"], + aggfunc="median", + dropna=False, + ) From e8bcc6305451b2ede65c807dcb31b6e3ef01adb1 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 27 Aug 2024 20:05:19 -0700 Subject: [PATCH 11/20] Use eval_snowpark... --- tests/integ/modin/crosstab/test_crosstab.py | 68 ++++++++++++--------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index 48fa27bd85b..9dfda0a60c0 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -11,10 +11,7 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker -from tests.integ.modin.utils import ( - assert_snowpark_pandas_equal_to_pandas, - eval_snowpark_pandas_result, -) +from tests.integ.modin.utils import eval_snowpark_pandas_result @pytest.mark.parametrize("dropna", [True, False]) @@ -190,18 +187,23 @@ def test_basic_crosstab_with_series_objs_full_overlap(self, dropna): "shiny", ], ) + + def eval_func(lib): + if lib is pd: + return lib.crosstab( + a, + [lib.Series(b), lib.Series(c)], + rownames=["a"], + colnames=["b", "c"], + dropna=dropna, + ) + else: + return lib.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna + ) + with SqlCounter(query_count=query_count, join_count=join_count): - native_df = native_pd.crosstab( - a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna - ) - snow_df = pd.crosstab( - a, - [pd.Series(b), pd.Series(c)], - rownames=["a"], - colnames=["b", "c"], - dropna=dropna, - ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) + eval_snowpark_pandas_result(pd, native_pd, eval_func) def test_basic_crosstab_with_series_objs_some_overlap(self, dropna): # In this case, some values are shared across indexes (non-zero intersection), @@ -806,22 +808,30 @@ def test_values_series_like(self, dropna, aggfunc): ) snow_df = pd.DataFrame(native_df) + def eval_func(df): + if isinstance(df, pd.DataFrame): + return pd.crosstab( + df["species"], + df["favorite_food"], + values=df["age"], + aggfunc=aggfunc, + dropna=dropna, + ) + else: + return native_pd.crosstab( + df["species"], + df["favorite_food"], + values=df["age"], + aggfunc=aggfunc, + dropna=dropna, + ) + with SqlCounter(query_count=query_count, join_count=join_count): - native_df = native_pd.crosstab( - native_df["species"], - native_df["favorite_food"], - values=native_df["age"], - aggfunc=aggfunc, - dropna=dropna, - ) - snow_df = pd.crosstab( - snow_df["species"], - snow_df["favorite_food"], - values=snow_df["age"], - aggfunc=aggfunc, - dropna=dropna, + eval_snowpark_pandas_result( + snow_df, + native_df, + eval_func, ) - assert_snowpark_pandas_equal_to_pandas(snow_df, native_df) @sql_count_checker(query_count=0) From 6399f4ccc16d69da5c0398f29e40b8c1e6408fea Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 27 Aug 2024 20:06:27 -0700 Subject: [PATCH 12/20] Remove crosstab from unsupported tests --- tests/unit/modin/test_unsupported.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/modin/test_unsupported.py b/tests/unit/modin/test_unsupported.py index ad0ae0aa5d6..8b07321a94c 100644 --- a/tests/unit/modin/test_unsupported.py +++ b/tests/unit/modin/test_unsupported.py @@ -45,7 +45,6 @@ def test_unsupported_io(io_method, kwargs): [ ["merge_ordered", {"left": "", "right": ""}], ["value_counts", {"values": ""}], - ["crosstab", {"index": "", "columns": ""}], ["lreshape", {"data": "", "groups": ""}], ["wide_to_long", {"df": "", "stubnames": "", "i": "", "j": ""}], ["to_timedelta", {"arg": ""}], From 53a41253c6776e98438cbf265b74003d93662f7a Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 27 Aug 2024 20:09:50 -0700 Subject: [PATCH 13/20] Fix docs, add all aggfuncs to tests --- docs/source/modin/supported/general_supported.rst | 3 ++- src/snowflake/snowpark/modin/pandas/general.py | 5 +++++ tests/integ/modin/crosstab/test_crosstab.py | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst index 0a67e57b39b..d13ee994fb8 100644 --- a/docs/source/modin/supported/general_supported.rst +++ b/docs/source/modin/supported/general_supported.rst @@ -18,7 +18,8 @@ Data manipulations | ``concat`` | P | ``levels`` is not supported, | | | | | ``copy`` is ignored | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``crosstab`` | Y | | | +| ``crosstab`` | P | | ``N`` if ``aggfunc`` is not one of | +| | | | "count", "mean", "min", "max", or "sum" | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``cut`` | P | ``retbins``, ``labels`` | ``N`` if ``retbins=True``or ``labels!=False`` | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 877ad4578d9..69a10cc08c0 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -2031,6 +2031,11 @@ def crosstab( DataFrame Cross tabulation of the data. + Notes + ----- + + Raises NotImplementedError if aggfunc is not one of "count", "mean", "min", "max", or "sum". + Examples -------- >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index 9dfda0a60c0..562f8eb506d 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -752,7 +752,7 @@ def test_normalize_and_margins(self, dropna, normalize): ), ) - @pytest.mark.parametrize("aggfunc", ["mean", "sum"]) + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) def test_values(self, dropna, aggfunc): query_count = 1 join_count = 2 if dropna else 5 @@ -786,7 +786,7 @@ def test_values(self, dropna, aggfunc): ), ) - @pytest.mark.parametrize("aggfunc", ["mean", "sum"]) + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) def test_values_series_like(self, dropna, aggfunc): query_count = 5 join_count = 2 if dropna else 5 From 76c01e2ab2a2986d59534603a82c06e61ee5ef9b Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 27 Aug 2024 20:31:15 -0700 Subject: [PATCH 14/20] Fix docs --- .../modin/supported/general_supported.rst | 4 +- .../snowpark/modin/pandas/general.py | 12 +- tests/integ/modin/crosstab/test_crosstab.py | 163 ++++++++++++++++++ 3 files changed, 177 insertions(+), 2 deletions(-) diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst index d13ee994fb8..cfd6d2d4c5a 100644 --- a/docs/source/modin/supported/general_supported.rst +++ b/docs/source/modin/supported/general_supported.rst @@ -19,7 +19,9 @@ Data manipulations | | | ``copy`` is ignored | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``crosstab`` | P | | ``N`` if ``aggfunc`` is not one of | -| | | | "count", "mean", "min", "max", or "sum" | +| | | | "count", "mean", "min", "max", or "sum", or | +| | | | margins is True, normalize is "all" or True, | +| | | | and values is passed. | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``cut`` | P | ``retbins``, ``labels`` | ``N`` if ``retbins=True``or ``labels!=False`` | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 69a10cc08c0..4bd70492c2f 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -2034,7 +2034,8 @@ def crosstab( Notes ----- - Raises NotImplementedError if aggfunc is not one of "count", "mean", "min", "max", or "sum". + Raises NotImplementedError if aggfunc is not one of "count", "mean", "min", "max", or "sum", or + margins is True, normalize is True or all, and values is passed. Examples -------- @@ -2063,6 +2064,15 @@ def crosstab( if not is_nested_list_like(columns): columns = [columns] + if ( + values is not None + and margins is True + and (normalize is True or normalize == "all") + ): + raise NotImplementedError( + 'Snowpark pandas does not yet support passing in margins=True, normalize="all", and values.' + ) + user_passed_rownames = rownames is not None user_passed_colnames = colnames is not None diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index 562f8eb506d..e30a4c2b4db 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -752,6 +752,169 @@ def test_normalize_and_margins(self, dropna, normalize): ), ) + @pytest.mark.parametrize("normalize", [0, 1, "index", "columns"]) + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) + def test_normalize_margins_and_values(self, dropna, normalize, aggfunc): + counts = { + "columns": [3, 29 if dropna else 41, 4], + "index": [1, 23 if dropna else 32, 3], + "all": [3, 54 if dropna else 75, 7], + } + counts[0] = counts["index"] + counts[1] = counts["columns"] + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = np.array( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) + if normalize is True: + sql_counts = counts["all"] + else: + sql_counts = counts[normalize] + + def eval_func(lib): + df = lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + values=vals, + normalize=normalize, + margins=True, + dropna=dropna, + aggfunc=aggfunc, + ) + if aggfunc == "sum": + # For some reason, the rounding is different for sum. + df = df.round(decimals=6) + return df + + with SqlCounter( + query_count=sql_counts[0], + join_count=sql_counts[1], + union_count=sql_counts[2], + ): + eval_snowpark_pandas_result( + pd, + native_pd, + eval_func, + ) + + @pytest.mark.parametrize("normalize", ["all", True]) + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) + @sql_count_checker(query_count=0) + def test_normalize_margins_and_values_not_supported( + self, dropna, normalize, aggfunc + ): + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = np.array( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) + with pytest.raises( + NotImplementedError, + match='Snowpark pandas does not yet support passing in margins=True, normalize="all", and values.', + ): + pd.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + values=vals, + normalize=normalize, + margins=True, + dropna=dropna, + aggfunc=aggfunc, + ) + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) def test_values(self, dropna, aggfunc): query_count = 1 From 906e00b64f7f8efa7ebeef8bfadc8e10d973249b Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 28 Aug 2024 14:56:10 -0700 Subject: [PATCH 15/20] Add tests --- tests/integ/modin/crosstab/test_crosstab.py | 168 +++++++++++++++++++- 1 file changed, 167 insertions(+), 1 deletion(-) diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index e30a4c2b4db..2c156630ad8 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -829,7 +829,7 @@ def eval_func(lib): aggfunc=aggfunc, ) if aggfunc == "sum": - # For some reason, the rounding is different for sum. + # Thanks to our hack, the rounding is different for sum. df = df.round(decimals=6) return df @@ -844,6 +844,172 @@ def eval_func(lib): eval_func, ) + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) + def test_margins_and_values(self, dropna, aggfunc): + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = np.array( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) + + def eval_func(lib): + df = lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + values=vals, + margins=True, + dropna=dropna, + aggfunc=aggfunc, + ) + return df + + with SqlCounter( + query_count=1, + join_count=7 if dropna else 10, + union_count=1, + ): + eval_snowpark_pandas_result( + pd, + native_pd, + eval_func, + ) + + @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) + @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) + def test_normalize_and_values(self, dropna, normalize, aggfunc): + counts = { + "columns": [2, 4 if dropna else 10], + "index": [1, 5 if dropna else 11], + "all": [2, 4 if dropna else 10], + } + counts[0] = counts["index"] + counts[1] = counts["columns"] + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = np.array( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) + if normalize is True: + sql_counts = counts["all"] + else: + sql_counts = counts[normalize] + + def eval_func(lib): + df = lib.crosstab( + a, + [b, c], + rownames=["a"], + colnames=["b", "c"], + values=vals, + normalize=normalize, + dropna=dropna, + aggfunc=aggfunc, + ) + if aggfunc in ["sum", "max"]: + # Thanks to our hack, the rounding is different for sum and max. + df = df.round(decimals=6) + return df + + with SqlCounter( + query_count=sql_counts[0], + join_count=sql_counts[1], + ): + eval_snowpark_pandas_result( + pd, + native_pd, + eval_func, + ) + @pytest.mark.parametrize("normalize", ["all", True]) @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) @sql_count_checker(query_count=0) From 5cdc71210f546e0d99459c415f8f327a9ffdb5a5 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 28 Aug 2024 16:18:29 -0700 Subject: [PATCH 16/20] Address review comments --- .../snowpark/modin/pandas/general.py | 9 + tests/integ/modin/crosstab/test_crosstab.py | 774 ++---------------- tests/integ/modin/crosstab/test_utils.py | 91 ++ 3 files changed, 151 insertions(+), 723 deletions(-) create mode 100644 tests/integ/modin/crosstab/test_utils.py diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 4bd70492c2f..e0afb909fa9 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -2282,6 +2282,15 @@ def _get_names_wrapper(list_of_objs, names, prefix): elif normalize == "all": # Normalize core f = normalizers[normalize] + + # When we perform the normalization function, we take the sum over + # the rows, and divide every value by the sum. Since margins is included + # though, the result of the sum is actually 2 * the sum of the original + # values (since the margin itself is the sum of the original values), + # so we need to multiply by 2 here to account for that. + # The alternative would be to apply normalization to the main table + # and the index margins separately, but that would require additional joins + # to get the final table, which we want to avoid. table = f(table.iloc[:, :-1]) * 2.0 column_margin = column_margin / column_margin.sum() diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index 2c156630ad8..8c228e895da 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -16,57 +16,9 @@ @pytest.mark.parametrize("dropna", [True, False]) class TestCrosstab: - def test_basic_crosstab_with_numpy_arrays(self, dropna): + def test_basic_crosstab_with_numpy_arrays(self, dropna, a, b, c): query_count = 1 join_count = 0 if dropna else 1 - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) - b = np.array( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) with SqlCounter(query_count=query_count, join_count=join_count): eval_snowpark_pandas_result( pd, @@ -76,49 +28,10 @@ def test_basic_crosstab_with_numpy_arrays(self, dropna): ), ) - def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna): - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - ], - dtype=object, - ) - b = np.array( - [ - "one", - "one", - "one", - "two", - "one", - "two", - "two", - "two", - "one", - ], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) + def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna, a, b, c): + a = a[:-1] + b = b[:-2] + c = c[:-3] with SqlCounter(query_count=0): eval_snowpark_pandas_result( pd, @@ -137,56 +50,10 @@ def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna): # only the intersection of the index objects of all Series when determining # the final DataFrame to pass into pivot_table, so here, we are testing # that we follow that behavior. - def test_basic_crosstab_with_series_objs_full_overlap(self, dropna): + def test_basic_crosstab_with_series_objs_full_overlap(self, dropna, a, b, c): # In this case, all indexes are identical - hence "full" overlap. query_count = 2 join_count = 5 if dropna else 10 - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) - b = native_pd.Series( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - ) - c = native_pd.Series( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - ) def eval_func(lib): if lib is pd: @@ -205,7 +72,7 @@ def eval_func(lib): with SqlCounter(query_count=query_count, join_count=join_count): eval_snowpark_pandas_result(pd, native_pd, eval_func) - def test_basic_crosstab_with_series_objs_some_overlap(self, dropna): + def test_basic_crosstab_with_series_objs_some_overlap(self, dropna, a, b, c): # In this case, some values are shared across indexes (non-zero intersection), # hence "some" overlap. # When a mix of Series and non-Series objects are passed in, the non-Series @@ -214,52 +81,12 @@ def test_basic_crosstab_with_series_objs_some_overlap(self, dropna): # are the length of the intersection rather than the length of each of the Series. query_count = 2 join_count = 5 if dropna else 10 - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) b = native_pd.Series( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], + b, index=list(range(len(a))), ) c = native_pd.Series( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], + c, index=-1 * np.array(list(range(len(a)))), ) @@ -288,7 +115,7 @@ def eval_func(args_list): ) @sql_count_checker(query_count=1, join_count=1) - def test_basic_crosstab_with_series_objs_some_overlap_error(self, dropna): + def test_basic_crosstab_with_series_objs_some_overlap_error(self, dropna, a, b, c): # Same as above - the intersection of the indexes of the Series objects # is non-zero, but the indexes are not identical - hence "some" overlap. # When a mix of Series and non-Series objects are passed in, the non-Series @@ -296,52 +123,12 @@ def test_basic_crosstab_with_series_objs_some_overlap_error(self, dropna): # of the Series objects. This test case errors because we pass in arrays that # are the length of the Series, rather than the length of the intersection of # the indexes of the Series. - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) b = native_pd.Series( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], + b, index=list(range(len(a))), ) c = native_pd.Series( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], + c, index=-1 * np.array(list(range(len(a)))), ) @@ -374,56 +161,16 @@ def eval_func(args_list): ) @sql_count_checker(query_count=1, join_count=1) - def test_basic_crosstab_with_series_objs_no_overlap_error(self, dropna): + def test_basic_crosstab_with_series_objs_no_overlap_error(self, dropna, a, b, c): # In this case, no values are shared across the indexes - the intersection is an # empty set - hence "no" overlap. We error here for the same reason as above - the # arrays passed in should also be empty, but are non-empty. - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) b = native_pd.Series( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], + b, index=list(range(len(a))), ) c = native_pd.Series( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], + c, index=-1 - np.array(list(range(len(a)))), ) @@ -455,53 +202,19 @@ def eval_func(args_list): assert_exception_equal=False, # Our error message is a little different. ) - def test_basic_crosstab_with_df_and_series_objs_pandas_errors(self, dropna): + def test_basic_crosstab_with_df_and_series_objs_pandas_errors( + self, dropna, a, b, c + ): query_count = 4 join_count = 1 if dropna else 3 a = native_pd.Series( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], + a, dtype=object, ) b = native_pd.DataFrame( { - "0": [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - "1": [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], + "0": b, + "1": c, } ) # pandas expects only Series objects, or DataFrames that have only a single column, while @@ -535,58 +248,11 @@ def eval_func(args_list): eval_func, ) - def test_margins(self, dropna): + def test_margins(self, dropna, a, b, c): query_count = 1 join_count = 1 if dropna else 2 union_count = 1 - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) - b = np.array( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) + with SqlCounter( query_count=query_count, join_count=join_count, union_count=union_count ): @@ -605,59 +271,12 @@ def test_margins(self, dropna): ) @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) - def test_normalize(self, dropna, normalize): + def test_normalize(self, dropna, normalize, a, b, c): query_count = 1 if normalize in (0, "index") else 2 join_count = 3 if normalize in (0, "index") else 2 if dropna: join_count -= 2 - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) - b = np.array( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) + with SqlCounter(query_count=query_count, join_count=join_count): eval_snowpark_pandas_result( pd, @@ -673,7 +292,7 @@ def test_normalize(self, dropna, normalize): ) @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) - def test_normalize_and_margins(self, dropna, normalize): + def test_normalize_and_margins(self, dropna, normalize, a, b, c): counts = { "columns": [3, 5 if dropna else 9, 4], "index": [1, 5 if dropna else 8, 3], @@ -681,54 +300,7 @@ def test_normalize_and_margins(self, dropna, normalize): } counts[0] = counts["index"] counts[1] = counts["columns"] - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) - b = np.array( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) + if normalize is True: sql_counts = counts["all"] else: @@ -754,7 +326,7 @@ def test_normalize_and_margins(self, dropna, normalize): @pytest.mark.parametrize("normalize", [0, 1, "index", "columns"]) @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) - def test_normalize_margins_and_values(self, dropna, normalize, aggfunc): + def test_normalize_margins_and_values(self, dropna, normalize, aggfunc, a, b, c): counts = { "columns": [3, 29 if dropna else 41, 4], "index": [1, 23 if dropna else 32, 3], @@ -762,54 +334,6 @@ def test_normalize_margins_and_values(self, dropna, normalize, aggfunc): } counts[0] = counts["index"] counts[1] = counts["columns"] - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) - b = np.array( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) if normalize is True: sql_counts = counts["all"] @@ -829,7 +353,12 @@ def eval_func(lib): aggfunc=aggfunc, ) if aggfunc == "sum": - # Thanks to our hack, the rounding is different for sum. + # When normalizing the data, we apply the normalization function to the + # entire table (including margins), which requires us to multiply by 2 + # (since the function takes the sum over the rows, and the margins row is + # itself the sum over the rows, causing the sum over all rows to be equal + # to 2 * the sum over the input rows). This hack allows us to save on joins + # but results in slight precision issues. df = df.round(decimals=6) return df @@ -845,55 +374,7 @@ def eval_func(lib): ) @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) - def test_margins_and_values(self, dropna, aggfunc): - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) - b = np.array( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) + def test_margins_and_values(self, dropna, aggfunc, a, b, c): vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) def eval_func(lib): @@ -922,7 +403,7 @@ def eval_func(lib): @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"]) @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) - def test_normalize_and_values(self, dropna, normalize, aggfunc): + def test_normalize_and_values(self, dropna, normalize, aggfunc, a, b, c): counts = { "columns": [2, 4 if dropna else 10], "index": [1, 5 if dropna else 11], @@ -930,54 +411,6 @@ def test_normalize_and_values(self, dropna, normalize, aggfunc): } counts[0] = counts["index"] counts[1] = counts["columns"] - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) - b = np.array( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) if normalize is True: sql_counts = counts["all"] @@ -996,7 +429,12 @@ def eval_func(lib): aggfunc=aggfunc, ) if aggfunc in ["sum", "max"]: - # Thanks to our hack, the rounding is different for sum and max. + # When normalizing the data, we apply the normalization function to the + # entire table (including margins), which requires us to multiply by 2 + # (since the function takes the sum over the rows, and the margins row is + # itself the sum over the rows, causing the sum over all rows to be equal + # to 2 * the sum over the input rows). This hack allows us to save on joins + # but results in slight precision issues. df = df.round(decimals=6) return df @@ -1014,56 +452,8 @@ def eval_func(lib): @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) @sql_count_checker(query_count=0) def test_normalize_margins_and_values_not_supported( - self, dropna, normalize, aggfunc + self, dropna, normalize, aggfunc, a, b, c ): - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) - b = np.array( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0]) with pytest.raises( NotImplementedError, @@ -1082,25 +472,10 @@ def test_normalize_margins_and_values_not_supported( ) @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) - def test_values(self, dropna, aggfunc): + def test_values(self, dropna, aggfunc, basic_crosstab_dfs): query_count = 1 join_count = 2 if dropna else 5 - native_df = native_pd.DataFrame( - { - "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], - "favorite_food": [ - "chicken", - "fish", - "fish", - "beef", - "chicken", - "beef", - "fish", - "beef", - ], - "age": [7, 2, 8, 5, 9, 3, 6, 1], - } - ) + native_df = basic_crosstab_dfs[0] with SqlCounter(query_count=query_count, join_count=join_count): eval_snowpark_pandas_result( @@ -1116,26 +491,10 @@ def test_values(self, dropna, aggfunc): ) @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"]) - def test_values_series_like(self, dropna, aggfunc): + def test_values_series_like(self, dropna, aggfunc, basic_crosstab_dfs): query_count = 5 join_count = 2 if dropna else 5 - native_df = native_pd.DataFrame( - { - "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], - "favorite_food": [ - "chicken", - "fish", - "fish", - "beef", - "chicken", - "beef", - "fish", - "beef", - ], - "age": [7, 2, 8, 5, 9, 3, 6, 1], - } - ) - snow_df = pd.DataFrame(native_df) + native_df, snow_df = basic_crosstab_dfs def eval_func(df): if isinstance(df, pd.DataFrame): @@ -1164,23 +523,8 @@ def eval_func(df): @sql_count_checker(query_count=0) -def test_values_unsupported_aggfunc(): - native_df = native_pd.DataFrame( - { - "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], - "favorite_food": [ - "chicken", - "fish", - "fish", - "beef", - "chicken", - "beef", - "fish", - "beef", - ], - "age": [7, 2, 8, 5, 9, 3, 6, 1], - } - ) +def test_values_unsupported_aggfunc(basic_crosstab_dfs): + native_df = basic_crosstab_dfs[0] with pytest.raises( NotImplementedError, @@ -1196,26 +540,10 @@ def test_values_unsupported_aggfunc(): @sql_count_checker(query_count=4) -def test_values_series_like_unsupported_aggfunc(): +def test_values_series_like_unsupported_aggfunc(basic_crosstab_dfs): # The query count above comes from building the DataFrame # that we pass in to pivot table. - native_df = native_pd.DataFrame( - { - "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], - "favorite_food": [ - "chicken", - "fish", - "fish", - "beef", - "chicken", - "beef", - "fish", - "beef", - ], - "age": [7, 2, 8, 5, 9, 3, 6, 1], - } - ) - snow_df = pd.DataFrame(native_df) + _, snow_df = basic_crosstab_dfs with pytest.raises( NotImplementedError, diff --git a/tests/integ/modin/crosstab/test_utils.py b/tests/integ/modin/crosstab/test_utils.py new file mode 100644 index 00000000000..6203419321d --- /dev/null +++ b/tests/integ/modin/crosstab/test_utils.py @@ -0,0 +1,91 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# + +import modin.pandas as pd +import numpy as np +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 + + +@pytest.fixture(scope="function") +def a(): + return np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + + +@pytest.fixture(scope="function") +def b(): + return np.array( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + dtype=object, + ) + + +@pytest.fixture(scope="function") +def c(): + return np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + + +@pytest.fixture(scope="function") +def basic_crosstab_dfs(): + df = native_pd.DataFrame( + { + "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"], + "favorite_food": [ + "chicken", + "fish", + "fish", + "beef", + "chicken", + "beef", + "fish", + "beef", + ], + "age": [7, 2, 8, 5, 9, 3, 6, 1], + } + ) + return df, pd.DataFrame(df) From a38bcb1c9650418e2db7679a5ba45c3e206c237a Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 28 Aug 2024 16:20:00 -0700 Subject: [PATCH 17/20] Address review comments --- tests/integ/modin/crosstab/{test_utils.py => conftest.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/integ/modin/crosstab/{test_utils.py => conftest.py} (100%) diff --git a/tests/integ/modin/crosstab/test_utils.py b/tests/integ/modin/crosstab/conftest.py similarity index 100% rename from tests/integ/modin/crosstab/test_utils.py rename to tests/integ/modin/crosstab/conftest.py From e835540ccc0ec50ba460d36c09d3c1a2c11f6aa8 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Wed, 28 Aug 2024 20:36:30 -0700 Subject: [PATCH 18/20] Fix doc --- src/snowflake/snowpark/modin/pandas/general.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index e0afb909fa9..237389bbaac 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -2028,7 +2028,7 @@ def crosstab( Returns ------- - DataFrame + Snowpark pandas :class:`~snowflake.snowpark.modin.pandas.DataFrame` Cross tabulation of the data. Notes From 4e7464cedcb9e4d9ce818295cd827ac983f5b950 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Fri, 30 Aug 2024 14:10:23 -0700 Subject: [PATCH 19/20] Update coverage --- tests/integ/modin/crosstab/test_crosstab.py | 83 ++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index 8c228e895da..61d35575185 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -202,7 +202,7 @@ def eval_func(args_list): assert_exception_equal=False, # Our error message is a little different. ) - def test_basic_crosstab_with_df_and_series_objs_pandas_errors( + def test_basic_crosstab_with_df_and_series_objs_pandas_errors_columns( self, dropna, a, b, c ): query_count = 4 @@ -248,6 +248,52 @@ def eval_func(args_list): eval_func, ) + def test_basic_crosstab_with_df_and_series_objs_pandas_errors_index( + self, dropna, a, b, c + ): + query_count = 4 + join_count = 1 if dropna else 3 + a = native_pd.Series( + a, + dtype=object, + ) + b = native_pd.DataFrame( + { + "0": b, + "1": c, + } + ) + # pandas expects only Series objects, or DataFrames that have only a single column, while + # we support accepting DataFrames with multiple columns. + with pytest.raises( + AssertionError, match="arrays and names must have the same length" + ): + native_pd.crosstab(b, a, rownames=["a", "b"], colnames=["c"], dropna=dropna) + + def eval_func(args_list): + a, b = args_list + if isinstance(a, native_pd.Series): + return native_pd.crosstab( + [b[c] for c in b.columns], + a, + rownames=["a", "b"], + colnames=["c"], + dropna=dropna, + ) + else: + return pd.crosstab( + b, a, rownames=["a", "b"], colnames=["c"], dropna=dropna + ) + + with SqlCounter(query_count=query_count, join_count=join_count): + native_args = [a, b] + snow_args = [pd.Series(a), pd.DataFrame(b)] + eval_snowpark_pandas_result( + snow_args, + native_args, + eval_func, + ) + def test_margins(self, dropna, a, b, c): query_count = 1 join_count = 1 if dropna else 2 @@ -556,3 +602,38 @@ def test_values_series_like_unsupported_aggfunc(basic_crosstab_dfs): aggfunc="median", dropna=False, ) + + +@sql_count_checker(query_count=0) +def test_values_aggfunc_one_supplied_should_error(a, b, c): + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab(index=a, columns=b, aggfunc="sum"), + expect_exception=True, + expect_exception_match="aggfunc cannot be used without values.", + expect_exception_type=ValueError, + assert_exception_equal=True, + ) + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab(index=a, columns=b, values=c), + expect_exception=True, + expect_exception_match="values cannot be used without an aggfunc.", + expect_exception_type=ValueError, + assert_exception_equal=True, + ) + + +@sql_count_checker(query_count=0) +def test_invalid_normalize(a, b): + eval_snowpark_pandas_result( + pd, + native_pd, + lambda lib: lib.crosstab(index=a, columns=b, normalize="invalid_value"), + expect_exception=True, + expect_exception_match="Not a valid normalize argument: invalid_value", + expect_exception_type=ValueError, + assert_exception_equal=True, + ) From 1430d1b50458fca5441d27ba37b664d4a426846e Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Fri, 30 Aug 2024 14:48:37 -0700 Subject: [PATCH 20/20] Fix tests --- src/snowflake/snowpark/modin/pandas/general.py | 2 +- tests/integ/modin/crosstab/test_crosstab.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index fb3822e5d54..df19e9eac91 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -2235,7 +2235,7 @@ def _get_names_wrapper(list_of_objs, names, prefix): # as a valid value of normalize is `0` (for normalizing index). if normalize is not False: if normalize not in [0, 1, "index", "columns", "all", True]: - raise ValueError(f"Not a valid normalize argument: {normalize}") + raise ValueError("Not a valid normalize argument") if normalize is True: normalize = "all" normalize = {0: "index", 1: "columns"}.get(normalize, normalize) diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py index 61d35575185..276650519d9 100644 --- a/tests/integ/modin/crosstab/test_crosstab.py +++ b/tests/integ/modin/crosstab/test_crosstab.py @@ -251,8 +251,8 @@ def eval_func(args_list): def test_basic_crosstab_with_df_and_series_objs_pandas_errors_index( self, dropna, a, b, c ): - query_count = 4 - join_count = 1 if dropna else 3 + query_count = 6 + join_count = 5 if dropna else 17 a = native_pd.Series( a, dtype=object, @@ -633,7 +633,7 @@ def test_invalid_normalize(a, b): native_pd, lambda lib: lib.crosstab(index=a, columns=b, normalize="invalid_value"), expect_exception=True, - expect_exception_match="Not a valid normalize argument: invalid_value", + expect_exception_match="Not a valid normalize argument", expect_exception_type=ValueError, assert_exception_equal=True, )