From 3ce073a47be83e25aa568bed4f44fe0150e6ba9b Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Tue, 25 Jun 2024 16:02:05 -0700
Subject: [PATCH 01/20] [SNOW-1502893]: Add support for `pd.crosstab`

---
 CHANGELOG.md                                  |   1 +
 docs/source/modin/general_functions.rst       |   1 +
 .../modin/supported/general_supported.rst     |   2 +-
 .../snowpark/modin/pandas/general.py          | 145 ++++++++++++++++--
 4 files changed, 132 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0d34d53844a..7de54de7a88 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -70,6 +70,7 @@
 - Added support for `Series.at`, `Series.iat`, `DataFrame.at`, and `DataFrame.iat`.
 - Added support for `Series.dt.isocalendar`.
 - Added support for `Series.case_when` except when condition or replacement is callable.
+- Added support for `pd.crosstab`.
 
 #### Bug Fixes
 
diff --git a/docs/source/modin/general_functions.rst b/docs/source/modin/general_functions.rst
index 737033e2190..40b6440473b 100644
--- a/docs/source/modin/general_functions.rst
+++ b/docs/source/modin/general_functions.rst
@@ -11,6 +11,7 @@ General functions
     :toctree: pandas_api/
 
     melt
+    crosstab
     pivot_table
     cut
     qcut
diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst
index 32c1142bb70..899c3c0e93f 100644
--- a/docs/source/modin/supported/general_supported.rst
+++ b/docs/source/modin/supported/general_supported.rst
@@ -18,7 +18,7 @@ Data manipulations
 | ``concat``                  | P                               | ``levels`` is not supported,     |                                                    |
 |                             |                                 | ``copy`` is ignored              |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``crosstab``                | N                               |                                  |                                                    |
+| ``crosstab``                | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``cut``                     | P                               | ``retbins``, ``labels``          | ``N`` if ``retbins=True``or ``labels!=False``      |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index bf0667e6365..8388c381b9c 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -22,7 +22,7 @@
 """Implement pandas general API."""
 from __future__ import annotations
 
-from collections.abc import Hashable, Iterable, Mapping, Sequence
+from collections.abc import Callable, Hashable, Iterable, Mapping, Sequence
 from datetime import date, datetime, tzinfo
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, Literal, Union
@@ -47,7 +47,7 @@
     _infer_tz_from_endpoints,
     _maybe_normalize_endpoints,
 )
-from pandas.core.dtypes.common import is_list_like
+from pandas.core.dtypes.common import is_list_like, is_nested_list_like
 from pandas.core.dtypes.inference import is_array_like
 from pandas.core.tools.datetimes import (
     ArrayConvertible,
@@ -1769,7 +1769,6 @@ def melt(
 
 
 @snowpark_pandas_telemetry_standalone_function_decorator
-@pandas_module_level_function_not_implemented()
 @_inherit_docstrings(pandas.crosstab, apilink="pandas.crosstab")
 def crosstab(
     index,
@@ -1786,20 +1785,134 @@ def crosstab(
     """
     Compute a simple cross tabulation of two (or more) factors.
     """
-    # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
-    pandas_crosstab = pandas.crosstab(
-        index,
-        columns,
-        values,
-        rownames,
-        colnames,
-        aggfunc,
-        margins,
-        margins_name,
-        dropna,
-        normalize,
+    if values is None and aggfunc is not None:
+        raise ValueError("aggfunc cannot be used without values.")
+
+    if values is not None and aggfunc is None:
+        raise ValueError("values cannot be used without an aggfunc.")
+
+    if not is_nested_list_like(index):
+        index = [index]
+    if not is_nested_list_like(columns):
+        columns = [columns]
+
+    from pandas.core.reshape.pivot import _build_names_mapper, _get_names
+
+    rownames = _get_names(index, rownames, prefix="row")
+    colnames = _get_names(columns, colnames, prefix="col")
+
+    (
+        rownames_mapper,
+        unique_rownames,
+        colnames_mapper,
+        unique_colnames,
+    ) = _build_names_mapper(rownames, colnames)
+
+    common_idx = None
+    pass_objs = [x for x in index + columns if isinstance(x, (Series, DataFrame))]
+    if pass_objs:
+        if len(pass_objs) == 1:
+            common_idx = pass_objs[0]
+        else:
+            common_idx = pass_objs[0].intersection(pass_objs[1:])
+
+    data = {
+        **dict(zip(unique_rownames, index)),
+        **dict(zip(unique_colnames, columns)),
+    }
+    df = DataFrame(data, index=common_idx)
+
+    if values is None:
+        df["__dummy__"] = 0
+        kwargs = {"aggfunc": "count"}
+    else:
+        df["__dummy__"] = values
+        kwargs = {"aggfunc": aggfunc}
+
+    table = df.pivot_table(
+        "__dummy__",
+        index=unique_rownames,
+        columns=unique_colnames,
+        margins=margins,
+        margins_name=margins_name,
+        dropna=dropna,
+        # observed=dropna,
+        **kwargs,  # type: ignore[arg-type]
     )
-    return DataFrame(pandas_crosstab)
+
+    if aggfunc is None:
+        # If no aggfunc is provided, we are computing frequencies. Since we use
+        # pivot_table above, pairs that are not observed will get a NaN value,
+        # so we need to fill all NaN values with 0.
+        table = table.fillna(0)
+
+    # We must explicitly check that the value of normalize is not False here,
+    # as a valid value of normalize is `0` (for normalizing index).
+    if normalize is not False:
+        if normalize not in [0, 1, "index", "columns", "all", True]:
+            raise ValueError(f"Not a valid normalize argument: {normalize}")
+        if normalize is True:
+            normalize = "all"
+        normalize = {0: "index", 1: "columns"}.get(normalize, normalize)
+
+        # Actual Normalizations
+        normalizers: dict[bool | str, Callable] = {
+            "all": lambda x: x / x.sum(axis=0).sum(),
+            "columns": lambda x: x / x.sum(),
+            "index": lambda x: x.div(x.sum(axis=1), axis=0),
+        }
+
+        if margins is False:
+
+            f = normalizers[normalize]
+
+            table = f(table)
+            table = table.fillna(0)
+        else:
+            # keep index and column of pivoted table
+            table_index = table.index
+            table_columns = table.columns
+
+            column_margin = table.iloc[:-1, -1]
+            index_margin = table.iloc[-1, :-1]
+
+            # keep the core table
+            table = table.iloc[:-1, :-1]
+
+            # Normalize core
+            f = normalizers[normalize]
+
+            table = f(table)
+            table = table.fillna(0)
+
+            # Fix Margins
+            if normalize == "columns":
+                column_margin = column_margin / column_margin.sum()
+                table = pd.concat([table, column_margin], axis=1)
+                table = table.fillna(0)
+                table.columns = table_columns
+
+            elif normalize == "index":
+                index_margin = index_margin / index_margin.sum()
+                table = table.append(index_margin, ignore_index=True)
+                table = table.fillna(0)
+                table.index = table_index
+
+            elif normalize == "all":
+                column_margin = column_margin / column_margin.sum()
+                index_margin = index_margin / index_margin.sum()
+                index_margin.loc[margins_name] = 1
+                table = pd.concat([table, column_margin], axis=1)
+                table = table.append(index_margin, ignore_index=True)
+
+                table = table.fillna(0)
+                table.index = table_index
+                table.columns = table_columns
+
+    table = table.rename_axis(index=rownames_mapper, axis=0)
+    table = table.rename_axis(columns=colnames_mapper, axis=1)
+
+    return table
 
 
 # Adding docstring since pandas docs don't have web section for this function.

From d9e5b798ddf06b5065f2e19467267fa168610463 Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Tue, 13 Aug 2024 10:55:52 -0700
Subject: [PATCH 02/20] Add initial tests + implementation

---
 src/snowflake/snowpark/modin/pandas/base.py   |   1 -
 .../snowpark/modin/pandas/general.py          |   3 +-
 .../compiler/snowflake_query_compiler.py      |   2 +-
 tests/integ/modin/crosstab/__init__.py        |   3 +
 tests/integ/modin/crosstab/test_crosstab.py   | 124 ++++++++++++++++++
 5 files changed, 130 insertions(+), 3 deletions(-)
 create mode 100644 tests/integ/modin/crosstab/__init__.py
 create mode 100644 tests/integ/modin/crosstab/test_crosstab.py

diff --git a/src/snowflake/snowpark/modin/pandas/base.py b/src/snowflake/snowpark/modin/pandas/base.py
index 43ed5ee389f..486fb8296ee 100644
--- a/src/snowflake/snowpark/modin/pandas/base.py
+++ b/src/snowflake/snowpark/modin/pandas/base.py
@@ -804,7 +804,6 @@ def aggregate(
         # TypeError: got an unexpected keyword argument 'skipna'
         if is_dict_like(func) and not uses_named_kwargs:
             kwargs.clear()
-
         result = self.__constructor__(
             query_compiler=self._query_compiler.agg(
                 func=func,
diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index 2ae9734fc8a..ece3de5b6fb 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -1927,8 +1927,9 @@ def crosstab(
         if margins is False:
 
             f = normalizers[normalize]
-
+            names = table.columns.names
             table = f(table)
+            table.columns.names = names
             table = table.fillna(0)
         else:
             # keep index and column of pivoted table
diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
index 37e04e3643b..00ee487e5bc 100644
--- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
+++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -13863,7 +13863,7 @@ def infer_sorted_column_labels(
         new_frame = InternalFrame.create(
             ordered_dataframe=expanded_ordered_frame,
             data_column_pandas_labels=expanded_data_column_pandas_labels,
-            data_column_pandas_index_names=[None],  # operation removes names
+            data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names,
             data_column_snowflake_quoted_identifiers=expanded_data_column_snowflake_quoted_identifiers,
             index_column_pandas_labels=index_column_pandas_labels,
             index_column_snowflake_quoted_identifiers=frame.index_column_snowflake_quoted_identifiers,
diff --git a/tests/integ/modin/crosstab/__init__.py b/tests/integ/modin/crosstab/__init__.py
new file mode 100644
index 00000000000..0fbef920926
--- /dev/null
+++ b/tests/integ/modin/crosstab/__init__.py
@@ -0,0 +1,3 @@
+#
+# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
+#
diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
new file mode 100644
index 00000000000..cfa254ef989
--- /dev/null
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -0,0 +1,124 @@
+#
+# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
+#
+
+import modin.pandas as pd
+import numpy as np
+import pandas as native_pd
+import pytest
+
+import snowflake.snowpark.modin.plugin  # noqa: F401
+from tests.integ.modin.sql_counter import sql_count_checker
+from tests.integ.modin.utils import assert_snowpark_pandas_equal_to_pandas
+
+
+@sql_count_checker(query_count=4, join_count=1)
+def test_basic_crosstab():
+    a = np.array(
+        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
+        dtype=object,
+    )
+    b = np.array(
+        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
+        dtype=object,
+    )
+    c = np.array(
+        [
+            "dull",
+            "dull",
+            "shiny",
+            "dull",
+            "dull",
+            "shiny",
+            "shiny",
+            "dull",
+            "shiny",
+            "shiny",
+            "shiny",
+        ],
+        dtype=object,
+    )
+    native_df = native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
+    snow_df = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
+    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+
+
+@sql_count_checker(query_count=4, join_count=4, union_count=3)
+def test_basic_crosstab_margins():
+    a = np.array(
+        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
+        dtype=object,
+    )
+    b = np.array(
+        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
+        dtype=object,
+    )
+    c = np.array(
+        [
+            "dull",
+            "dull",
+            "shiny",
+            "dull",
+            "dull",
+            "shiny",
+            "shiny",
+            "dull",
+            "shiny",
+            "shiny",
+            "shiny",
+        ],
+        dtype=object,
+    )
+    native_df = native_pd.crosstab(
+        a,
+        [b, c],
+        rownames=["a"],
+        colnames=["b", "c"],
+        margins=True,
+        margins_name="MARGINS_NAME",
+    )
+    snow_df = pd.crosstab(
+        a,
+        [b, c],
+        rownames=["a"],
+        colnames=["b", "c"],
+        margins=True,
+        margins_name="MARGINS_NAME",
+    )
+    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+
+
+@sql_count_checker(query_count=5, join_count=1)
+@pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
+def test_basic_crosstab_normalize(normalize):
+    a = np.array(
+        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
+        dtype=object,
+    )
+    b = np.array(
+        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
+        dtype=object,
+    )
+    c = np.array(
+        [
+            "dull",
+            "dull",
+            "shiny",
+            "dull",
+            "dull",
+            "shiny",
+            "shiny",
+            "dull",
+            "shiny",
+            "shiny",
+            "shiny",
+        ],
+        dtype=object,
+    )
+    native_df = native_pd.crosstab(
+        a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize
+    )
+    snow_df = pd.crosstab(
+        a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize
+    )
+    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)

From b883db730e9fb385fd1d788df36d007ef5418d3b Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Tue, 13 Aug 2024 12:02:28 -0700
Subject: [PATCH 03/20] Add values tests

---
 .../snowpark/modin/pandas/general.py          |   4 +-
 tests/integ/modin/crosstab/test_crosstab.py   | 119 +++++++++++++++++-
 2 files changed, 120 insertions(+), 3 deletions(-)

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index ece3de5b6fb..1fca0aa5e94 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -1876,7 +1876,9 @@ def crosstab(
         if len(pass_objs) == 1:
             common_idx = pass_objs[0]
         else:
-            common_idx = pass_objs[0].intersection(pass_objs[1:])
+            common_idx = pass_objs[0].index.intersection(
+                [obj.index for obj in pass_objs[1:]]
+            )
 
     data = {
         **dict(zip(unique_rownames, index)),
diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index cfa254ef989..93c40c2691c 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -44,7 +44,7 @@ def test_basic_crosstab():
 
 
 @sql_count_checker(query_count=4, join_count=4, union_count=3)
-def test_basic_crosstab_margins():
+def test_margins():
     a = np.array(
         ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
         dtype=object,
@@ -90,7 +90,7 @@ def test_basic_crosstab_margins():
 
 @sql_count_checker(query_count=5, join_count=1)
 @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
-def test_basic_crosstab_normalize(normalize):
+def test_normalize(normalize):
     a = np.array(
         ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
         dtype=object,
@@ -122,3 +122,118 @@ def test_basic_crosstab_normalize(normalize):
         a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize
     )
     assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+
+
+@sql_count_checker(query_count=6, join_count=12, union_count=8)
+@pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
+def test_normalize_and_margins(normalize):
+    a = np.array(
+        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
+        dtype=object,
+    )
+    b = np.array(
+        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
+        dtype=object,
+    )
+    c = np.array(
+        [
+            "dull",
+            "dull",
+            "shiny",
+            "dull",
+            "dull",
+            "shiny",
+            "shiny",
+            "dull",
+            "shiny",
+            "shiny",
+            "shiny",
+        ],
+        dtype=object,
+    )
+    native_df = native_pd.crosstab(
+        a,
+        [b, c],
+        rownames=["a"],
+        colnames=["b", "c"],
+        normalize=normalize,
+        margins=True,
+    )
+    snow_df = pd.crosstab(
+        a,
+        [b, c],
+        rownames=["a"],
+        colnames=["b", "c"],
+        normalize=normalize,
+        margins=True,
+    )
+    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+
+
+@sql_count_checker(query_count=4, join_count=7)
+@pytest.mark.parametrize("aggfunc", ["mean", "sum"])
+def test_values(aggfunc):
+    native_df = native_pd.DataFrame(
+        {
+            "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
+            "favorite_food": [
+                "chicken",
+                "fish",
+                "fish",
+                "beef",
+                "chicken",
+                "beef",
+                "fish",
+                "beef",
+            ],
+            "age": [7, 2, 8, 5, 9, 3, 6, 1],
+        }
+    )
+    native_df_result = native_pd.crosstab(
+        native_df["species"].values,
+        native_df["favorite_food"].values,
+        values=native_df["age"].values,
+        aggfunc=aggfunc,
+    )
+    snow_df = pd.crosstab(
+        native_df["species"].values,
+        native_df["favorite_food"].values,
+        values=native_df["age"].values,
+        aggfunc=aggfunc,
+    )
+    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df_result)
+
+
+@sql_count_checker(query_count=4, join_count=7)
+@pytest.mark.parametrize("aggfunc", ["mean", "sum"])
+def test_values_series_like(aggfunc):
+    native_df = native_pd.DataFrame(
+        {
+            "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
+            "favorite_food": [
+                "chicken",
+                "fish",
+                "fish",
+                "beef",
+                "chicken",
+                "beef",
+                "fish",
+                "beef",
+            ],
+            "age": [7, 2, 8, 5, 9, 3, 6, 1],
+        }
+    )
+    snow_df = pd.DataFrame(native_df)
+    native_df = native_pd.crosstab(
+        native_df["species"],
+        native_df["favorite_food"],
+        values=native_df["age"],
+        aggfunc=aggfunc,
+    )
+    snow_df = pd.crosstab(
+        snow_df["species"],
+        snow_df["favorite_food"],
+        values=snow_df["age"],
+        aggfunc=aggfunc,
+    )
+    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)

From 794b592bbf063e12b36d71a194a9e3183d9c7bbe Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Tue, 20 Aug 2024 14:55:28 -0700
Subject: [PATCH 04/20] Add more support

---
 .../snowpark/modin/pandas/general.py          | 124 +++++++--
 .../compiler/snowflake_query_compiler.py      |  13 +-
 tests/integ/modin/crosstab/test_crosstab.py   | 259 +++++++++++++++++-
 3 files changed, 372 insertions(+), 24 deletions(-)

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index 1fca0aa5e94..51787b54d53 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -1858,10 +1858,28 @@ def crosstab(
     if not is_nested_list_like(columns):
         columns = [columns]
 
+    user_passed_rownames = rownames is not None
+    user_passed_colnames = colnames is not None
+
     from pandas.core.reshape.pivot import _build_names_mapper, _get_names
 
-    rownames = _get_names(index, rownames, prefix="row")
-    colnames = _get_names(columns, colnames, prefix="col")
+    def _get_names_wrapper(list_of_objs, names, prefix):
+        """
+        Helper method to expand DataFrame objects containing
+        multiple columns into Series, since `_get_names` expects
+        one column per entry.
+        """
+        expanded_list_of_objs = []
+        for obj in list_of_objs:
+            if isinstance(obj, DataFrame):
+                for col in obj.columns:
+                    expanded_list_of_objs.append(obj[col])
+            else:
+                expanded_list_of_objs.append(obj)
+        return _get_names(expanded_list_of_objs, names, prefix)
+
+    rownames = _get_names_wrapper(index, rownames, prefix="row")
+    colnames = _get_names_wrapper(columns, colnames, prefix="col")
 
     (
         rownames_mapper,
@@ -1870,21 +1888,89 @@ def crosstab(
         unique_colnames,
     ) = _build_names_mapper(rownames, colnames)
 
-    common_idx = None
     pass_objs = [x for x in index + columns if isinstance(x, (Series, DataFrame))]
+    row_idx_names = None
+    col_idx_names = None
     if pass_objs:
-        if len(pass_objs) == 1:
-            common_idx = pass_objs[0]
-        else:
-            common_idx = pass_objs[0].index.intersection(
-                [obj.index for obj in pass_objs[1:]]
-            )
-
-    data = {
-        **dict(zip(unique_rownames, index)),
-        **dict(zip(unique_colnames, columns)),
-    }
-    df = DataFrame(data, index=common_idx)
+        # If we have any Snowpark pandas objects in the index or columns, then we
+        # need to find the intersection of their indices, and only pick rows from
+        # the objects that have indices in the intersection of their indices.
+        # After we do that, we then need to append the non Snowpark pandas objects
+        # using the intersection of indices as the final index for the DataFrame object.
+        # First, we separate the objects into Snowpark pandas objects, and non-Snowpark
+        # pandas objects (while renaming them so that they have unique names).
+        rownames_idx = 0
+        row_idx_names = []
+        dfs = []
+        arrays = []
+        for obj in index:
+            if isinstance(obj, Series):
+                row_idx_names.append(obj.name)
+                df = pd.DataFrame(obj)
+                df.columns = [unique_rownames[rownames_idx]]
+                rownames_idx += 1
+                dfs.append(df)
+            elif isinstance(obj, DataFrame):
+                row_idx_names.extend(obj.columns)
+                obj.columns = unique_rownames[
+                    rownames_idx : rownames_idx + len(obj.columns)
+                ]
+                rownames_idx += len(obj.columns)
+                dfs.append(obj)
+            else:
+                row_idx_names.append(None)
+                df = pd.DataFrame(obj)
+                df.columns = unique_rownames[
+                    rownames_idx : rownames_idx + len(df.columns)
+                ]
+                rownames_idx += len(df.columns)
+                arrays.append(df)
+
+        colnames_idx = 0
+        col_idx_names = []
+        for obj in columns:
+            if isinstance(obj, Series):
+                col_idx_names.append(obj.name)
+                df = pd.DataFrame(obj)
+                df.columns = [unique_colnames[colnames_idx]]
+                colnames_idx += 1
+                dfs.append(df)
+            elif isinstance(obj, DataFrame):
+                col_idx_names.extend(obj.columns)
+                obj.columns = unique_colnames[
+                    colnames_idx : colnames_idx + len(obj.columns)
+                ]
+                colnames_idx += len(obj.columns)
+                dfs.append(obj)
+            else:
+                col_idx_names.append(None)
+                df = pd.DataFrame(obj)
+                df.columns = unique_colnames[
+                    colnames_idx : colnames_idx + len(df.columns)
+                ]
+                colnames_idx += len(df.columns)
+                arrays.append(df)
+
+        # Now, we have two lists - a list of Snowpark pandas objects, and a list of objects
+        # that were not passed in as Snowpark pandas objects, but that we have converted
+        # to Snowpark pandas objects to give them column names. We can perform inner joins
+        # on the dfs list to get a DataFrame with the final index (that is only an intersection
+        # of indices.)
+        df = dfs[0]
+        for right in dfs[1:]:
+            df = df.merge(right, left_index=True, right_index=True)
+
+        if len(arrays) > 0:
+            index = df.index
+            right_df = pd.concat(arrays, axis=1)
+            right_df.index = index
+            df = df.merge(right_df, left_index=True, right_index=True)
+    else:
+        data = {
+            **dict(zip(unique_rownames, index)),
+            **dict(zip(unique_colnames, columns)),
+        }
+        df = DataFrame(data)
 
     if values is None:
         df["__dummy__"] = 0
@@ -1904,6 +1990,12 @@ def crosstab(
         **kwargs,  # type: ignore[arg-type]
     )
 
+    if row_idx_names is not None and not user_passed_rownames:
+        table.index = table.index.set_names(row_idx_names)
+
+    if col_idx_names is not None and not user_passed_colnames:
+        table.columns = table.columns.set_names(col_idx_names)
+
     if aggfunc is None:
         # If no aggfunc is provided, we are computing frequencies. Since we use
         # pivot_table above, pairs that are not observed will get a NaN value,
@@ -1923,7 +2015,7 @@ def crosstab(
         normalizers: dict[bool | str, Callable] = {
             "all": lambda x: x / x.sum(axis=0).sum(),
             "columns": lambda x: x / x.sum(),
-            "index": lambda x: x.div(x.sum(axis=1), axis=0),
+            "index": lambda x: x.div(x.sum(axis=1)),
         }
 
         if margins is False:
diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
index 00ee487e5bc..27f3d88c536 100644
--- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
+++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -5272,11 +5272,14 @@ def agg(
                     )
                     for agg_arg in agg_args
                 }
+                pandas_labels = list(agg_col_map.keys())
+                if self.is_multiindex(axis=1):
+                    pandas_labels = [
+                        (label,) * len(self.columns.names) for label in pandas_labels
+                    ]
                 single_agg_func_query_compilers.append(
                     SnowflakeQueryCompiler(
-                        frame.project_columns(
-                            list(agg_col_map.keys()), list(agg_col_map.values())
-                        )
+                        frame.project_columns(pandas_labels, list(agg_col_map.values()))
                     )
                 )
         else:  # axis == 0
@@ -13816,9 +13819,7 @@ def infer_sorted_column_labels(
             new_frame = InternalFrame.create(
                 ordered_dataframe=expanded_ordered_frame,
                 data_column_pandas_labels=sorted_column_labels,
-                data_column_pandas_index_names=[
-                    None
-                ],  # operation removes column index name always.
+                data_column_pandas_index_names=self._modin_frame.data_column_pandas_index_names,
                 data_column_snowflake_quoted_identifiers=frame.data_column_snowflake_quoted_identifiers
                 + new_identifiers,
                 index_column_pandas_labels=index_column_pandas_labels,
diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index 93c40c2691c..9ab416e0e14 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -2,6 +2,8 @@
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
 
+import re
+
 import modin.pandas as pd
 import numpy as np
 import pandas as native_pd
@@ -9,7 +11,10 @@
 
 import snowflake.snowpark.modin.plugin  # noqa: F401
 from tests.integ.modin.sql_counter import sql_count_checker
-from tests.integ.modin.utils import assert_snowpark_pandas_equal_to_pandas
+from tests.integ.modin.utils import (
+    assert_snowpark_pandas_equal_to_pandas,
+    eval_snowpark_pandas_result,
+)
 
 
 @sql_count_checker(query_count=4, join_count=1)
@@ -43,6 +48,251 @@ def test_basic_crosstab():
     assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
 
+@sql_count_checker(query_count=6, join_count=14)
+def test_basic_crosstab_with_series_objs_full_overlap():
+    a = np.array(
+        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
+        dtype=object,
+    )
+    b = native_pd.Series(
+        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
+    )
+    c = native_pd.Series(
+        [
+            "dull",
+            "dull",
+            "shiny",
+            "dull",
+            "dull",
+            "shiny",
+            "shiny",
+            "dull",
+            "shiny",
+            "shiny",
+            "shiny",
+        ],
+    )
+    native_df = native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
+    snow_df = pd.crosstab(
+        a, [pd.Series(b), pd.Series(c)], rownames=["a"], colnames=["b", "c"]
+    )
+    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+
+
+@sql_count_checker(query_count=6, join_count=14)
+def test_basic_crosstab_with_series_objs_some_overlap():
+    a = np.array(
+        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
+        dtype=object,
+    )
+    b = native_pd.Series(
+        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
+        index=list(range(len(a))),
+    )
+    c = native_pd.Series(
+        [
+            "dull",
+            "dull",
+            "shiny",
+            "dull",
+            "dull",
+            "shiny",
+            "shiny",
+            "dull",
+            "shiny",
+            "shiny",
+            "shiny",
+        ],
+        index=-1 * np.array(list(range(len(a)))),
+    )
+
+    # All columns have to be the same length (if NumPy arrays are present, then
+    # pandas errors if they do not match the length of the other Series after
+    # they are joined (i.e. filtered so that their indices are the same)). In
+    # this test, we truncate the numpy column so that the lengths are correct.
+    def eval_func(args_list):
+        a, b, c = args_list
+        if isinstance(b, native_pd.Series):
+            return native_pd.crosstab(
+                a[:1], [b, c], rownames=["a"], colnames=["b", "c"]
+            )
+        else:
+            return pd.crosstab(a[:1], [b, c], rownames=["a"], colnames=["b", "c"])
+
+    native_args = [a, b, c]
+    snow_args = [a, pd.Series(b), pd.Series(c)]
+    eval_snowpark_pandas_result(
+        snow_args,
+        native_args,
+        eval_func,
+    )
+
+
+@sql_count_checker(query_count=2, join_count=1)
+def test_basic_crosstab_with_series_objs_some_overlap_error():
+    a = np.array(
+        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
+        dtype=object,
+    )
+    b = native_pd.Series(
+        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
+        index=list(range(len(a))),
+    )
+    c = native_pd.Series(
+        [
+            "dull",
+            "dull",
+            "shiny",
+            "dull",
+            "dull",
+            "shiny",
+            "shiny",
+            "dull",
+            "shiny",
+            "shiny",
+            "shiny",
+        ],
+        index=-1 * np.array(list(range(len(a)))),
+    )
+
+    # All columns have to be the same length (if NumPy arrays are present, then
+    # pandas errors if they do not match the length of the other Series after
+    # they are joined (i.e. filtered so that their indices are the same))
+    def eval_func(args_list):
+        a, b, c = args_list
+        if isinstance(b, native_pd.Series):
+            return native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
+        else:
+            return pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
+
+    native_args = [a, b, c]
+    snow_args = [a, pd.Series(b), pd.Series(c)]
+    eval_snowpark_pandas_result(
+        snow_args,
+        native_args,
+        eval_func,
+        expect_exception=True,
+        expect_exception_match=re.escape(
+            "Length mismatch: Expected 11 rows, received array of length 1"
+        ),
+        expect_exception_type=ValueError,
+        assert_exception_equal=False,  # Our error message is a little different.
+    )
+
+
+@sql_count_checker(query_count=2, join_count=1)
+def test_basic_crosstab_with_series_objs_no_overlap_error():
+    a = np.array(
+        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
+        dtype=object,
+    )
+    b = native_pd.Series(
+        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
+        index=list(range(len(a))),
+    )
+    c = native_pd.Series(
+        [
+            "dull",
+            "dull",
+            "shiny",
+            "dull",
+            "dull",
+            "shiny",
+            "shiny",
+            "dull",
+            "shiny",
+            "shiny",
+            "shiny",
+        ],
+        index=-1 - np.array(list(range(len(a)))),
+    )
+
+    # All columns have to be the same length (if NumPy arrays are present, then
+    # pandas errors if they do not match the length of the other Series after
+    # they are joined (i.e. filtered so that their indices are the same))
+    def eval_func(args_list):
+        a, b, c = args_list
+        if isinstance(b, native_pd.Series):
+            return native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
+        else:
+            return pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
+
+    native_args = [a, b, c]
+    snow_args = [a, pd.Series(b), pd.Series(c)]
+    eval_snowpark_pandas_result(
+        snow_args,
+        native_args,
+        eval_func,
+        expect_exception=True,
+        expect_exception_match=re.escape(
+            "Length mismatch: Expected 11 rows, received array of length 0"
+        ),
+        expect_exception_type=ValueError,
+        assert_exception_equal=False,  # Our error message is a little different.
+    )
+
+
+@sql_count_checker(query_count=7, join_count=4)
+def test_basic_crosstab_with_df_and_series_objs_pandas_errors():
+    a = native_pd.Series(
+        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
+        dtype=object,
+    )
+    b = native_pd.DataFrame(
+        {
+            "0": [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            "1": [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+        }
+    )
+    # pandas expects only Series objects, or DataFrames that have only a single column, while
+    # we support accepting DataFrames with multiple columns.
+    with pytest.raises(
+        AssertionError, match="arrays and names must have the same length"
+    ):
+        native_pd.crosstab(a, b, rownames=["a"], colnames=["b", "c"])
+
+    def eval_func(args_list):
+        a, b = args_list
+        if isinstance(a, native_pd.Series):
+            return native_pd.crosstab(
+                a, [b[c] for c in b.columns], rownames=["a"], colnames=["b", "c"]
+            )
+        else:
+            return pd.crosstab(a, b, rownames=["a"], colnames=["b", "c"])
+
+    native_args = [a, b]
+    snow_args = [pd.Series(a), pd.DataFrame(b)]
+    eval_snowpark_pandas_result(
+        snow_args,
+        native_args,
+        eval_func,
+    )
+
+
 @sql_count_checker(query_count=4, join_count=4, union_count=3)
 def test_margins():
     a = np.array(
@@ -204,7 +454,12 @@ def test_values(aggfunc):
     assert_snowpark_pandas_equal_to_pandas(snow_df, native_df_result)
 
 
-@sql_count_checker(query_count=4, join_count=7)
+@sql_count_checker(
+    query_count=11,
+    join_count=14,
+    high_count_expected=True,
+    high_count_reason="Need to perform multiple joins followed by a pivot_table, which itself includes multiple joins.",
+)
 @pytest.mark.parametrize("aggfunc", ["mean", "sum"])
 def test_values_series_like(aggfunc):
     native_df = native_pd.DataFrame(

From f76a0341267ea5222673661c1e3ef8f37724930e Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Tue, 20 Aug 2024 18:27:45 -0700
Subject: [PATCH 05/20] Add support for everything but normalize + margins
 together

---
 .../snowpark/modin/pandas/general.py          |  2 +-
 .../compiler/snowflake_query_compiler.py      | 30 +++++++++++++-
 tests/integ/modin/crosstab/test_crosstab.py   | 40 +++++++++----------
 3 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index 51787b54d53..0d0bf4cdeb6 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -2015,7 +2015,7 @@ def _get_names_wrapper(list_of_objs, names, prefix):
         normalizers: dict[bool | str, Callable] = {
             "all": lambda x: x / x.sum(axis=0).sum(),
             "columns": lambda x: x / x.sum(),
-            "index": lambda x: x.div(x.sum(axis=1)),
+            "index": lambda x: x.div(x.sum(axis=1), axis="index"),
         }
 
         if margins is False:
diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
index 13c524f7762..4e89dbdd7e9 100644
--- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
+++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -13673,7 +13673,6 @@ def create_lazy_type_functions(
         assert len(right_result_data_identifiers) == 1, "other must be a Series"
         right = right_result_data_identifiers[0]
         right_datatype = right_datatypes[0]
-
         # now replace in result frame identifiers with binary op result
         update_result = joined_frame.result_frame.update_snowflake_quoted_identifiers_with_expressions(
             {
@@ -13691,6 +13690,7 @@ def create_lazy_type_functions(
         identifiers_to_keep = set(
             new_frame.index_column_snowflake_quoted_identifiers
         ) | set(update_result.old_id_to_new_id_mappings.values())
+        self_is_column_mi = len(self._modin_frame.data_column_pandas_index_names)
         label_to_snowflake_quoted_identifier = tuple(
             filter(
                 lambda pair: pair.snowflake_quoted_identifier in identifiers_to_keep,
@@ -13698,11 +13698,37 @@ def create_lazy_type_functions(
             )
         )
 
+        first_column_label = label_to_snowflake_quoted_identifier[
+            new_frame.num_index_columns
+        ].label
+
+        if (
+            self_is_column_mi
+            and isinstance(first_column_label, tuple)
+            and isinstance(first_column_label[0], tuple)
+        ):
+            from snowflake.snowpark.modin.plugin._internal.frame import (
+                LabelIdentifierPair,
+            )
+
+            new_label_to_snowflake_quoted_identifier = list(
+                label_to_snowflake_quoted_identifier[: new_frame.num_index_columns]
+            )
+            for pair in label_to_snowflake_quoted_identifier[
+                new_frame.num_index_columns :
+            ]:
+                new_label_to_snowflake_quoted_identifier.append(
+                    LabelIdentifierPair(pair.label[0], pair.snowflake_quoted_identifier)
+                )
+            label_to_snowflake_quoted_identifier = tuple(
+                new_label_to_snowflake_quoted_identifier
+            )
+
         new_frame = InternalFrame(
             ordered_dataframe=new_frame.ordered_dataframe,
             label_to_snowflake_quoted_identifier=label_to_snowflake_quoted_identifier,
             num_index_columns=new_frame.num_index_columns,
-            data_column_index_names=new_frame.data_column_index_names,
+            data_column_index_names=self._modin_frame.data_column_index_names,
             snowflake_quoted_identifier_to_snowpark_pandas_type={
                 pair.snowflake_quoted_identifier: None
                 for pair in label_to_snowflake_quoted_identifier
diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index 9ab416e0e14..3d6bd149b24 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -10,14 +10,14 @@
 import pytest
 
 import snowflake.snowpark.modin.plugin  # noqa: F401
-from tests.integ.modin.sql_counter import sql_count_checker
+from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker
 from tests.integ.modin.utils import (
     assert_snowpark_pandas_equal_to_pandas,
     eval_snowpark_pandas_result,
 )
 
 
-@sql_count_checker(query_count=4, join_count=1)
+@sql_count_checker(query_count=3)
 def test_basic_crosstab():
     a = np.array(
         ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
@@ -48,7 +48,7 @@ def test_basic_crosstab():
     assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
 
-@sql_count_checker(query_count=6, join_count=14)
+@sql_count_checker(query_count=5, join_count=13)
 def test_basic_crosstab_with_series_objs_full_overlap():
     a = np.array(
         ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
@@ -79,7 +79,7 @@ def test_basic_crosstab_with_series_objs_full_overlap():
     assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
 
-@sql_count_checker(query_count=6, join_count=14)
+@sql_count_checker(query_count=5, join_count=13)
 def test_basic_crosstab_with_series_objs_some_overlap():
     a = np.array(
         ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
@@ -232,7 +232,7 @@ def eval_func(args_list):
     )
 
 
-@sql_count_checker(query_count=7, join_count=4)
+@sql_count_checker(query_count=6, join_count=3)
 def test_basic_crosstab_with_df_and_series_objs_pandas_errors():
     a = native_pd.Series(
         ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
@@ -293,7 +293,7 @@ def eval_func(args_list):
     )
 
 
-@sql_count_checker(query_count=4, join_count=4, union_count=3)
+@sql_count_checker(query_count=3, join_count=3, union_count=3)
 def test_margins():
     a = np.array(
         ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
@@ -338,9 +338,10 @@ def test_margins():
     assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
 
-@sql_count_checker(query_count=5, join_count=1)
 @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
 def test_normalize(normalize):
+    query_count = 4 if normalize not in (0, "index") else 3
+    join_count = 0 if normalize not in (0, "index") else 3
     a = np.array(
         ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
         dtype=object,
@@ -365,16 +366,17 @@ def test_normalize(normalize):
         ],
         dtype=object,
     )
-    native_df = native_pd.crosstab(
-        a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize
-    )
-    snow_df = pd.crosstab(
-        a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize
-    )
-    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+    with SqlCounter(query_count=query_count, join_count=join_count):
+        native_df = native_pd.crosstab(
+            a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize
+        )
+        snow_df = pd.crosstab(
+            a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize
+        )
+        assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
 
-@sql_count_checker(query_count=6, join_count=12, union_count=8)
+@sql_count_checker(query_count=5, join_count=11, union_count=8)
 @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
 def test_normalize_and_margins(normalize):
     a = np.array(
@@ -420,7 +422,7 @@ def test_normalize_and_margins(normalize):
     assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
 
-@sql_count_checker(query_count=4, join_count=7)
+@sql_count_checker(query_count=3, join_count=6)
 @pytest.mark.parametrize("aggfunc", ["mean", "sum"])
 def test_values(aggfunc):
     native_df = native_pd.DataFrame(
@@ -455,10 +457,8 @@ def test_values(aggfunc):
 
 
 @sql_count_checker(
-    query_count=11,
-    join_count=14,
-    high_count_expected=True,
-    high_count_reason="Need to perform multiple joins followed by a pivot_table, which itself includes multiple joins.",
+    query_count=9,
+    join_count=10,
 )
 @pytest.mark.parametrize("aggfunc", ["mean", "sum"])
 def test_values_series_like(aggfunc):

From d9c759fbc657d4074a70e3390697388a858b04b9 Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Tue, 20 Aug 2024 18:40:17 -0700
Subject: [PATCH 06/20] Add support for normalize + margins on rows

---
 src/snowflake/snowpark/modin/pandas/general.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index 0d0bf4cdeb6..0f320c19c4e 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -2050,8 +2050,8 @@ def _get_names_wrapper(list_of_objs, names, prefix):
                 table.columns = table_columns
 
             elif normalize == "index":
-                index_margin = index_margin / index_margin.sum()
-                table = table.append(index_margin, ignore_index=True)
+                index_margin = pd.DataFrame(index_margin / index_margin.sum()).T
+                table = pd.concat([table, index_margin]).reset_index(drop=True)
                 table = table.fillna(0)
                 table.index = table_index
 

From c84fe32c64c3365f948d0784bac8e967c728cc01 Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Wed, 21 Aug 2024 16:14:34 -0700
Subject: [PATCH 07/20] Wrap up implementation of crosstab

---
 .../snowpark/modin/pandas/general.py          |   38 +-
 tests/integ/modin/crosstab/test_crosstab.py   | 1147 ++++++++++-------
 2 files changed, 729 insertions(+), 456 deletions(-)

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index 0f320c19c4e..88560936fdf 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -1986,7 +1986,6 @@ def _get_names_wrapper(list_of_objs, names, prefix):
         margins=margins,
         margins_name=margins_name,
         dropna=dropna,
-        # observed=dropna,
         **kwargs,  # type: ignore[arg-type]
     )
 
@@ -2031,36 +2030,37 @@ def _get_names_wrapper(list_of_objs, names, prefix):
             table_columns = table.columns
 
             column_margin = table.iloc[:-1, -1]
-            index_margin = table.iloc[-1, :-1]
 
-            # keep the core table
-            table = table.iloc[:-1, :-1]
-
-            # Normalize core
-            f = normalizers[normalize]
-
-            table = f(table)
-            table = table.fillna(0)
-
-            # Fix Margins
             if normalize == "columns":
+                # keep the core table
+                table = table.iloc[:-1, :-1]
+
+                # Normalize core
+                f = normalizers[normalize]
+                table = f(table)
+                table = table.fillna(0)
+                # Fix Margins
                 column_margin = column_margin / column_margin.sum()
                 table = pd.concat([table, column_margin], axis=1)
                 table = table.fillna(0)
                 table.columns = table_columns
 
             elif normalize == "index":
-                index_margin = pd.DataFrame(index_margin / index_margin.sum()).T
-                table = pd.concat([table, index_margin]).reset_index(drop=True)
-                table = table.fillna(0)
-                table.index = table_index
+                table = table.iloc[:, :-1]
+
+                # Normalize core
+                f = normalizers[normalize]
+                table = f(table)
+                table = table.fillna(0).reindex(index=table_index)
 
             elif normalize == "all":
+                # Normalize core
+                f = normalizers[normalize]
+                table = f(table.iloc[:, :-1]) * 2.0
+
                 column_margin = column_margin / column_margin.sum()
-                index_margin = index_margin / index_margin.sum()
-                index_margin.loc[margins_name] = 1
                 table = pd.concat([table, column_margin], axis=1)
-                table = table.append(index_margin, ignore_index=True)
+                table.iloc[-1, -1] = 1
 
                 table = table.fillna(0)
                 table.index = table_index
diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index 3d6bd149b24..8dace579af7 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -17,230 +17,89 @@
 )
 
 
-@sql_count_checker(query_count=3)
-def test_basic_crosstab():
-    a = np.array(
-        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
-        dtype=object,
-    )
-    b = np.array(
-        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
-        dtype=object,
-    )
-    c = np.array(
-        [
-            "dull",
-            "dull",
-            "shiny",
-            "dull",
-            "dull",
-            "shiny",
-            "shiny",
-            "dull",
-            "shiny",
-            "shiny",
-            "shiny",
-        ],
-        dtype=object,
-    )
-    native_df = native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
-    snow_df = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
-    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
-
-
-@sql_count_checker(query_count=5, join_count=13)
-def test_basic_crosstab_with_series_objs_full_overlap():
-    a = np.array(
-        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
-        dtype=object,
-    )
-    b = native_pd.Series(
-        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
-    )
-    c = native_pd.Series(
-        [
-            "dull",
-            "dull",
-            "shiny",
-            "dull",
-            "dull",
-            "shiny",
-            "shiny",
-            "dull",
-            "shiny",
-            "shiny",
-            "shiny",
-        ],
-    )
-    native_df = native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
-    snow_df = pd.crosstab(
-        a, [pd.Series(b), pd.Series(c)], rownames=["a"], colnames=["b", "c"]
-    )
-    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
-
-
-@sql_count_checker(query_count=5, join_count=13)
-def test_basic_crosstab_with_series_objs_some_overlap():
-    a = np.array(
-        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
-        dtype=object,
-    )
-    b = native_pd.Series(
-        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
-        index=list(range(len(a))),
-    )
-    c = native_pd.Series(
-        [
-            "dull",
-            "dull",
-            "shiny",
-            "dull",
-            "dull",
-            "shiny",
-            "shiny",
-            "dull",
-            "shiny",
-            "shiny",
-            "shiny",
-        ],
-        index=-1 * np.array(list(range(len(a)))),
-    )
-
-    # All columns have to be the same length (if NumPy arrays are present, then
-    # pandas errors if they do not match the length of the other Series after
-    # they are joined (i.e. filtered so that their indices are the same)). In
-    # this test, we truncate the numpy column so that the lengths are correct.
-    def eval_func(args_list):
-        a, b, c = args_list
-        if isinstance(b, native_pd.Series):
-            return native_pd.crosstab(
-                a[:1], [b, c], rownames=["a"], colnames=["b", "c"]
+@pytest.mark.parametrize("dropna", [True, False])
+class TestCrosstab:
+    def test_basic_crosstab(self, dropna):
+        query_count = 3
+        join_count = 0 if dropna else 3
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = np.array(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            dtype=object,
+        )
+        c = np.array(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            dtype=object,
+        )
+        with SqlCounter(query_count=query_count, join_count=join_count):
+            native_df = native_pd.crosstab(
+                a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
             )
-        else:
-            return pd.crosstab(a[:1], [b, c], rownames=["a"], colnames=["b", "c"])
-
-    native_args = [a, b, c]
-    snow_args = [a, pd.Series(b), pd.Series(c)]
-    eval_snowpark_pandas_result(
-        snow_args,
-        native_args,
-        eval_func,
-    )
-
-
-@sql_count_checker(query_count=2, join_count=1)
-def test_basic_crosstab_with_series_objs_some_overlap_error():
-    a = np.array(
-        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
-        dtype=object,
-    )
-    b = native_pd.Series(
-        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
-        index=list(range(len(a))),
-    )
-    c = native_pd.Series(
-        [
-            "dull",
-            "dull",
-            "shiny",
-            "dull",
-            "dull",
-            "shiny",
-            "shiny",
-            "dull",
-            "shiny",
-            "shiny",
-            "shiny",
-        ],
-        index=-1 * np.array(list(range(len(a)))),
-    )
-
-    # All columns have to be the same length (if NumPy arrays are present, then
-    # pandas errors if they do not match the length of the other Series after
-    # they are joined (i.e. filtered so that their indices are the same))
-    def eval_func(args_list):
-        a, b, c = args_list
-        if isinstance(b, native_pd.Series):
-            return native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
-        else:
-            return pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
-
-    native_args = [a, b, c]
-    snow_args = [a, pd.Series(b), pd.Series(c)]
-    eval_snowpark_pandas_result(
-        snow_args,
-        native_args,
-        eval_func,
-        expect_exception=True,
-        expect_exception_match=re.escape(
-            "Length mismatch: Expected 11 rows, received array of length 1"
-        ),
-        expect_exception_type=ValueError,
-        assert_exception_equal=False,  # Our error message is a little different.
-    )
-
-
-@sql_count_checker(query_count=2, join_count=1)
-def test_basic_crosstab_with_series_objs_no_overlap_error():
-    a = np.array(
-        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
-        dtype=object,
-    )
-    b = native_pd.Series(
-        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
-        index=list(range(len(a))),
-    )
-    c = native_pd.Series(
-        [
-            "dull",
-            "dull",
-            "shiny",
-            "dull",
-            "dull",
-            "shiny",
-            "shiny",
-            "dull",
-            "shiny",
-            "shiny",
-            "shiny",
-        ],
-        index=-1 - np.array(list(range(len(a)))),
-    )
-
-    # All columns have to be the same length (if NumPy arrays are present, then
-    # pandas errors if they do not match the length of the other Series after
-    # they are joined (i.e. filtered so that their indices are the same))
-    def eval_func(args_list):
-        a, b, c = args_list
-        if isinstance(b, native_pd.Series):
-            return native_pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
-        else:
-            return pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
-
-    native_args = [a, b, c]
-    snow_args = [a, pd.Series(b), pd.Series(c)]
-    eval_snowpark_pandas_result(
-        snow_args,
-        native_args,
-        eval_func,
-        expect_exception=True,
-        expect_exception_match=re.escape(
-            "Length mismatch: Expected 11 rows, received array of length 0"
-        ),
-        expect_exception_type=ValueError,
-        assert_exception_equal=False,  # Our error message is a little different.
-    )
-
-
-@sql_count_checker(query_count=6, join_count=3)
-def test_basic_crosstab_with_df_and_series_objs_pandas_errors():
-    a = native_pd.Series(
-        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
-        dtype=object,
-    )
-    b = native_pd.DataFrame(
-        {
-            "0": [
+            snow_df = pd.crosstab(
+                a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+            )
+            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+
+    def test_basic_crosstab_with_series_objs_full_overlap(self, dropna):
+        query_count = 5
+        join_count = 13 if dropna else 28
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = native_pd.Series(
+            [
                 "one",
                 "one",
                 "one",
@@ -253,7 +112,9 @@ def test_basic_crosstab_with_df_and_series_objs_pandas_errors():
                 "two",
                 "one",
             ],
-            "1": [
+        )
+        c = native_pd.Series(
+            [
                 "dull",
                 "dull",
                 "shiny",
@@ -266,229 +127,641 @@ def test_basic_crosstab_with_df_and_series_objs_pandas_errors():
                 "shiny",
                 "shiny",
             ],
-        }
-    )
-    # pandas expects only Series objects, or DataFrames that have only a single column, while
-    # we support accepting DataFrames with multiple columns.
-    with pytest.raises(
-        AssertionError, match="arrays and names must have the same length"
-    ):
-        native_pd.crosstab(a, b, rownames=["a"], colnames=["b", "c"])
-
-    def eval_func(args_list):
-        a, b = args_list
-        if isinstance(a, native_pd.Series):
-            return native_pd.crosstab(
-                a, [b[c] for c in b.columns], rownames=["a"], colnames=["b", "c"]
+        )
+        with SqlCounter(query_count=query_count, join_count=join_count):
+            native_df = native_pd.crosstab(
+                a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
             )
-        else:
-            return pd.crosstab(a, b, rownames=["a"], colnames=["b", "c"])
-
-    native_args = [a, b]
-    snow_args = [pd.Series(a), pd.DataFrame(b)]
-    eval_snowpark_pandas_result(
-        snow_args,
-        native_args,
-        eval_func,
-    )
-
-
-@sql_count_checker(query_count=3, join_count=3, union_count=3)
-def test_margins():
-    a = np.array(
-        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
-        dtype=object,
-    )
-    b = np.array(
-        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
-        dtype=object,
-    )
-    c = np.array(
-        [
-            "dull",
-            "dull",
-            "shiny",
-            "dull",
-            "dull",
-            "shiny",
-            "shiny",
-            "dull",
-            "shiny",
-            "shiny",
-            "shiny",
-        ],
-        dtype=object,
-    )
-    native_df = native_pd.crosstab(
-        a,
-        [b, c],
-        rownames=["a"],
-        colnames=["b", "c"],
-        margins=True,
-        margins_name="MARGINS_NAME",
-    )
-    snow_df = pd.crosstab(
-        a,
-        [b, c],
-        rownames=["a"],
-        colnames=["b", "c"],
-        margins=True,
-        margins_name="MARGINS_NAME",
-    )
-    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+            snow_df = pd.crosstab(
+                a,
+                [pd.Series(b), pd.Series(c)],
+                rownames=["a"],
+                colnames=["b", "c"],
+                dropna=dropna,
+            )
+            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+
+    def test_basic_crosstab_with_series_objs_some_overlap(self, dropna):
+        query_count = 5
+        join_count = 13 if dropna else 28
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = native_pd.Series(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            index=list(range(len(a))),
+        )
+        c = native_pd.Series(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            index=-1 * np.array(list(range(len(a)))),
+        )
 
+        # All columns have to be the same length (if NumPy arrays are present, then
+        # pandas errors if they do not match the length of the other Series after
+        # they are joined (i.e. filtered so that their indices are the same)). In
+        # this test, we truncate the numpy column so that the lengths are correct.
+        def eval_func(args_list):
+            a, b, c = args_list
+            if isinstance(b, native_pd.Series):
+                return native_pd.crosstab(
+                    a[:1], [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+                )
+            else:
+                return pd.crosstab(
+                    a[:1], [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+                )
+
+        with SqlCounter(query_count=query_count, join_count=join_count):
+            native_args = [a, b, c]
+            snow_args = [a, pd.Series(b), pd.Series(c)]
+            eval_snowpark_pandas_result(
+                snow_args,
+                native_args,
+                eval_func,
+            )
 
-@pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
-def test_normalize(normalize):
-    query_count = 4 if normalize not in (0, "index") else 3
-    join_count = 0 if normalize not in (0, "index") else 3
-    a = np.array(
-        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
-        dtype=object,
-    )
-    b = np.array(
-        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
-        dtype=object,
-    )
-    c = np.array(
-        [
-            "dull",
-            "dull",
-            "shiny",
-            "dull",
-            "dull",
-            "shiny",
-            "shiny",
-            "dull",
-            "shiny",
-            "shiny",
-            "shiny",
-        ],
-        dtype=object,
-    )
-    with SqlCounter(query_count=query_count, join_count=join_count):
-        native_df = native_pd.crosstab(
-            a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize
+    @sql_count_checker(query_count=2, join_count=1)
+    def test_basic_crosstab_with_series_objs_some_overlap_error(self, dropna):
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = native_pd.Series(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            index=list(range(len(a))),
         )
-        snow_df = pd.crosstab(
-            a, [b, c], rownames=["a"], colnames=["b", "c"], normalize=normalize
+        c = native_pd.Series(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            index=-1 * np.array(list(range(len(a)))),
         )
-        assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
+        # All columns have to be the same length (if NumPy arrays are present, then
+        # pandas errors if they do not match the length of the other Series after
+        # they are joined (i.e. filtered so that their indices are the same))
+        def eval_func(args_list):
+            a, b, c = args_list
+            if isinstance(b, native_pd.Series):
+                return native_pd.crosstab(
+                    a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+                )
+            else:
+                return pd.crosstab(
+                    a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+                )
+
+        native_args = [a, b, c]
+        snow_args = [a, pd.Series(b), pd.Series(c)]
+        eval_snowpark_pandas_result(
+            snow_args,
+            native_args,
+            eval_func,
+            expect_exception=True,
+            expect_exception_match=re.escape(
+                "Length mismatch: Expected 11 rows, received array of length 1"
+            ),
+            expect_exception_type=ValueError,
+            assert_exception_equal=False,  # Our error message is a little different.
+        )
 
-@sql_count_checker(query_count=5, join_count=11, union_count=8)
-@pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
-def test_normalize_and_margins(normalize):
-    a = np.array(
-        ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo"],
-        dtype=object,
-    )
-    b = np.array(
-        ["one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one"],
-        dtype=object,
-    )
-    c = np.array(
-        [
-            "dull",
-            "dull",
-            "shiny",
-            "dull",
-            "dull",
-            "shiny",
-            "shiny",
-            "dull",
-            "shiny",
-            "shiny",
-            "shiny",
-        ],
-        dtype=object,
-    )
-    native_df = native_pd.crosstab(
-        a,
-        [b, c],
-        rownames=["a"],
-        colnames=["b", "c"],
-        normalize=normalize,
-        margins=True,
-    )
-    snow_df = pd.crosstab(
-        a,
-        [b, c],
-        rownames=["a"],
-        colnames=["b", "c"],
-        normalize=normalize,
-        margins=True,
-    )
-    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+    @sql_count_checker(query_count=2, join_count=1)
+    def test_basic_crosstab_with_series_objs_no_overlap_error(self, dropna):
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = native_pd.Series(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            index=list(range(len(a))),
+        )
+        c = native_pd.Series(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            index=-1 - np.array(list(range(len(a)))),
+        )
 
+        # All columns have to be the same length (if NumPy arrays are present, then
+        # pandas errors if they do not match the length of the other Series after
+        # they are joined (i.e. filtered so that their indices are the same))
+        def eval_func(args_list):
+            a, b, c = args_list
+            if isinstance(b, native_pd.Series):
+                return native_pd.crosstab(
+                    a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+                )
+            else:
+                return pd.crosstab(
+                    a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+                )
+
+        native_args = [a, b, c]
+        snow_args = [a, pd.Series(b), pd.Series(c)]
+        eval_snowpark_pandas_result(
+            snow_args,
+            native_args,
+            eval_func,
+            expect_exception=True,
+            expect_exception_match=re.escape(
+                "Length mismatch: Expected 11 rows, received array of length 0"
+            ),
+            expect_exception_type=ValueError,
+            assert_exception_equal=False,  # Our error message is a little different.
+        )
 
-@sql_count_checker(query_count=3, join_count=6)
-@pytest.mark.parametrize("aggfunc", ["mean", "sum"])
-def test_values(aggfunc):
-    native_df = native_pd.DataFrame(
-        {
-            "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
-            "favorite_food": [
-                "chicken",
-                "fish",
-                "fish",
-                "beef",
-                "chicken",
-                "beef",
-                "fish",
-                "beef",
+    def test_basic_crosstab_with_df_and_series_objs_pandas_errors(self, dropna):
+        query_count = 6
+        join_count = 3 if dropna else 9
+        a = native_pd.Series(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
             ],
-            "age": [7, 2, 8, 5, 9, 3, 6, 1],
-        }
-    )
-    native_df_result = native_pd.crosstab(
-        native_df["species"].values,
-        native_df["favorite_food"].values,
-        values=native_df["age"].values,
-        aggfunc=aggfunc,
-    )
-    snow_df = pd.crosstab(
-        native_df["species"].values,
-        native_df["favorite_food"].values,
-        values=native_df["age"].values,
-        aggfunc=aggfunc,
-    )
-    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df_result)
-
+            dtype=object,
+        )
+        b = native_pd.DataFrame(
+            {
+                "0": [
+                    "one",
+                    "one",
+                    "one",
+                    "two",
+                    "one",
+                    "one",
+                    "one",
+                    "two",
+                    "two",
+                    "two",
+                    "one",
+                ],
+                "1": [
+                    "dull",
+                    "dull",
+                    "shiny",
+                    "dull",
+                    "dull",
+                    "shiny",
+                    "shiny",
+                    "dull",
+                    "shiny",
+                    "shiny",
+                    "shiny",
+                ],
+            }
+        )
+        # pandas expects only Series objects, or DataFrames that have only a single column, while
+        # we support accepting DataFrames with multiple columns.
+        with pytest.raises(
+            AssertionError, match="arrays and names must have the same length"
+        ):
+            native_pd.crosstab(a, b, rownames=["a"], colnames=["b", "c"], dropna=dropna)
+
+        def eval_func(args_list):
+            a, b = args_list
+            if isinstance(a, native_pd.Series):
+                return native_pd.crosstab(
+                    a,
+                    [b[c] for c in b.columns],
+                    rownames=["a"],
+                    colnames=["b", "c"],
+                    dropna=dropna,
+                )
+            else:
+                return pd.crosstab(
+                    a, b, rownames=["a"], colnames=["b", "c"], dropna=dropna
+                )
+
+        with SqlCounter(query_count=query_count, join_count=join_count):
+            native_args = [a, b]
+            snow_args = [pd.Series(a), pd.DataFrame(b)]
+            eval_snowpark_pandas_result(
+                snow_args,
+                native_args,
+                eval_func,
+            )
 
-@sql_count_checker(
-    query_count=9,
-    join_count=10,
-)
-@pytest.mark.parametrize("aggfunc", ["mean", "sum"])
-def test_values_series_like(aggfunc):
-    native_df = native_pd.DataFrame(
-        {
-            "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
-            "favorite_food": [
-                "chicken",
-                "fish",
-                "fish",
-                "beef",
-                "chicken",
-                "beef",
-                "fish",
-                "beef",
+    def test_margins(self, dropna):
+        query_count = 3
+        join_count = 3 if dropna else 6
+        union_count = 3
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = np.array(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
             ],
-            "age": [7, 2, 8, 5, 9, 3, 6, 1],
+            dtype=object,
+        )
+        c = np.array(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            dtype=object,
+        )
+        with SqlCounter(
+            query_count=query_count, join_count=join_count, union_count=union_count
+        ):
+            native_df = native_pd.crosstab(
+                a,
+                [b, c],
+                rownames=["a"],
+                colnames=["b", "c"],
+                margins=True,
+                margins_name="MARGINS_NAME",
+                dropna=dropna,
+            )
+            snow_df = pd.crosstab(
+                a,
+                [b, c],
+                rownames=["a"],
+                colnames=["b", "c"],
+                margins=True,
+                margins_name="MARGINS_NAME",
+                dropna=dropna,
+            )
+            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+
+    @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
+    def test_normalize(self, dropna, normalize):
+        query_count = 4 if normalize not in (0, "index") else 3
+        join_count = 0 if normalize not in (0, "index") else 3
+        if not dropna:
+            join_count = 9 if normalize in (0, "index") else 4
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = np.array(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            dtype=object,
+        )
+        c = np.array(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            dtype=object,
+        )
+        with SqlCounter(query_count=query_count, join_count=join_count):
+            native_df = native_pd.crosstab(
+                a,
+                [b, c],
+                rownames=["a"],
+                colnames=["b", "c"],
+                normalize=normalize,
+                dropna=dropna,
+            )
+            snow_df = pd.crosstab(
+                a,
+                [b, c],
+                rownames=["a"],
+                colnames=["b", "c"],
+                normalize=normalize,
+                dropna=dropna,
+            )
+            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+
+    @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
+    def test_normalize_and_margins(self, dropna, normalize):
+        counts = {
+            "columns": [5, 11 if dropna else 19, 8],
+            "index": [3, 15 if dropna else 24, 9],
+            "all": [7, 41 if dropna else 63, 22],
         }
-    )
-    snow_df = pd.DataFrame(native_df)
-    native_df = native_pd.crosstab(
-        native_df["species"],
-        native_df["favorite_food"],
-        values=native_df["age"],
-        aggfunc=aggfunc,
-    )
-    snow_df = pd.crosstab(
-        snow_df["species"],
-        snow_df["favorite_food"],
-        values=snow_df["age"],
-        aggfunc=aggfunc,
-    )
-    assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+        counts[0] = counts["index"]
+        counts[1] = counts["columns"]
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = np.array(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            dtype=object,
+        )
+        c = np.array(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            dtype=object,
+        )
+        if normalize is True:
+            sql_counts = counts["all"]
+        else:
+            sql_counts = counts[normalize]
+        with SqlCounter(
+            query_count=sql_counts[0],
+            join_count=sql_counts[1],
+            union_count=sql_counts[2],
+        ):
+            native_df = native_pd.crosstab(
+                a,
+                [b, c],
+                rownames=["a"],
+                colnames=["b", "c"],
+                normalize=normalize,
+                margins=True,
+                dropna=dropna,
+            )
+            snow_df = pd.crosstab(
+                a,
+                [b, c],
+                rownames=["a"],
+                colnames=["b", "c"],
+                normalize=normalize,
+                margins=True,
+                dropna=dropna,
+            )
+            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+
+    @pytest.mark.parametrize("aggfunc", ["mean", "sum"])
+    def test_values(self, dropna, aggfunc):
+        query_count = 3
+        join_count = 6 if dropna else 15
+        native_df = native_pd.DataFrame(
+            {
+                "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
+                "favorite_food": [
+                    "chicken",
+                    "fish",
+                    "fish",
+                    "beef",
+                    "chicken",
+                    "beef",
+                    "fish",
+                    "beef",
+                ],
+                "age": [7, 2, 8, 5, 9, 3, 6, 1],
+            }
+        )
+
+        with SqlCounter(query_count=query_count, join_count=join_count):
+            native_df_result = native_pd.crosstab(
+                native_df["species"].values,
+                native_df["favorite_food"].values,
+                values=native_df["age"].values,
+                aggfunc=aggfunc,
+                dropna=dropna,
+            )
+            snow_df = pd.crosstab(
+                native_df["species"].values,
+                native_df["favorite_food"].values,
+                values=native_df["age"].values,
+                aggfunc=aggfunc,
+                dropna=dropna,
+            )
+            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df_result)
+
+    @pytest.mark.parametrize("aggfunc", ["mean", "sum"])
+    def test_values_series_like(self, dropna, aggfunc):
+        query_count = 9
+        join_count = 10 if dropna else 25
+        native_df = native_pd.DataFrame(
+            {
+                "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
+                "favorite_food": [
+                    "chicken",
+                    "fish",
+                    "fish",
+                    "beef",
+                    "chicken",
+                    "beef",
+                    "fish",
+                    "beef",
+                ],
+                "age": [7, 2, 8, 5, 9, 3, 6, 1],
+            }
+        )
+        snow_df = pd.DataFrame(native_df)
+
+        with SqlCounter(query_count=query_count, join_count=join_count):
+            native_df = native_pd.crosstab(
+                native_df["species"],
+                native_df["favorite_food"],
+                values=native_df["age"],
+                aggfunc=aggfunc,
+                dropna=dropna,
+            )
+            snow_df = pd.crosstab(
+                snow_df["species"],
+                snow_df["favorite_food"],
+                values=snow_df["age"],
+                aggfunc=aggfunc,
+                dropna=dropna,
+            )
+            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)

From f668414698a8eef93a60545e55233ae63097a4e3 Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Wed, 21 Aug 2024 16:24:37 -0700
Subject: [PATCH 08/20] Fix small bug after merge

---
 .../modin/plugin/compiler/snowflake_query_compiler.py    | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
index 9c5f5027b6d..da2a001f4e3 100644
--- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
+++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -195,7 +195,10 @@
     compute_bin_indices,
     preprocess_bins_for_cut,
 )
-from snowflake.snowpark.modin.plugin._internal.frame import InternalFrame
+from snowflake.snowpark.modin.plugin._internal.frame import (
+    InternalFrame,
+    LabelIdentifierPair,
+)
 from snowflake.snowpark.modin.plugin._internal.groupby_utils import (
     check_is_groupby_supported_by_snowflake,
     extract_groupby_column_pandas_labels,
@@ -13737,7 +13740,9 @@ def create_lazy_type_functions(
                     and isinstance(pair.label, tuple)
                     and isinstance(pair.label[0], tuple)
                 ):
-                    pair.label = pair.label[0]
+                    pair = LabelIdentifierPair(
+                        pair.label[0], pair.snowflake_quoted_identifier
+                    )
                 label_to_snowflake_quoted_identifier.append(pair)
                 snowflake_quoted_identifier_to_snowpark_pandas_type[
                     pair.snowflake_quoted_identifier

From f2ba2f4f061026ec24ce2782b80a1fac3dd51aff Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Wed, 21 Aug 2024 16:54:36 -0700
Subject: [PATCH 09/20] Add docstrings

---
 .../snowpark/modin/pandas/general.py          | 55 ++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index 88560936fdf..d89e72eaf24 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -1831,7 +1831,6 @@ def melt(
 
 
 @snowpark_pandas_telemetry_standalone_function_decorator
-@_inherit_docstrings(pandas.crosstab, apilink="pandas.crosstab")
 def crosstab(
     index,
     columns,
@@ -1846,6 +1845,60 @@ def crosstab(
 ) -> DataFrame:  # noqa: PR01, RT01, D200
     """
     Compute a simple cross tabulation of two (or more) factors.
+
+    By default, computes a frequency table of the factors unless an array
+    of values and an aggregation function are passed.
+
+    Parameters
+    ----------
+    index : array-like, Series, or list of arrays/Series
+        Values to group by in the rows.
+    columns : array-like, Series, or list of arrays/Series
+        Values to group by in the columns.
+    values : array-like, optional
+        Array of values to aggregate according to the factors.
+        Requires aggfunc be specified.
+    rownames : sequence, default None
+        If passed, must match number of row arrays passed.
+    colnames : sequence, default None
+        If passed, must match number of column arrays passed.
+    aggfunc : function, optional
+        If specified, requires values be specified as well.
+    margins : bool, default False
+        Add row/column margins (subtotals).
+    margins_name : str, default 'All'
+        Name of the row/column that will contain the totals when margins is True.
+    dropna : bool, default True
+        Do not include columns whose entries are all NaN.
+
+    normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False
+        Normalize by dividing all values by the sum of values.
+
+        * If passed 'all' or True, will normalize over all values.
+        * If passed 'index' will normalize over each row.
+        * If passed 'columns' will normalize over each column.
+        * If margins is True, will also normalize margin values.
+
+    Returns
+    -------
+    DataFrame
+        Cross tabulation of the data.
+
+    Examples
+    --------
+    >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
+    ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
+    >>> b = np.array(["one", "one", "one", "two", "one", "one",
+    ...               "one", "two", "two", "two", "one"], dtype=object)
+    >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
+    ...               "shiny", "dull", "shiny", "shiny", "shiny"],
+    ...              dtype=object)
+    >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) # doctest: +NORMALIZE_WHITESPACE
+    b    one        two
+    c   dull shiny dull shiny
+    a
+    bar    1     2    1     0
+    foo    2     2    1     2
     """
     if values is None and aggfunc is not None:
         raise ValueError("aggfunc cannot be used without values.")

From da12522e18cfb66d93c81b4403f051bd0377c4d7 Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Tue, 27 Aug 2024 20:01:43 -0700
Subject: [PATCH 10/20] Fix tests, address review comments

---
 src/snowflake/snowpark/modin/pandas/base.py   |   1 +
 .../snowpark/modin/pandas/general.py          |  13 +-
 tests/integ/modin/crosstab/__init__.py        |   3 -
 tests/integ/modin/crosstab/test_crosstab.py   | 316 ++++++++++++------
 4 files changed, 233 insertions(+), 100 deletions(-)
 delete mode 100644 tests/integ/modin/crosstab/__init__.py

diff --git a/src/snowflake/snowpark/modin/pandas/base.py b/src/snowflake/snowpark/modin/pandas/base.py
index 9b5bd4d8d4e..c08cdee1386 100644
--- a/src/snowflake/snowpark/modin/pandas/base.py
+++ b/src/snowflake/snowpark/modin/pandas/base.py
@@ -801,6 +801,7 @@ def aggregate(
         # TypeError: got an unexpected keyword argument 'skipna'
         if is_dict_like(func) and not uses_named_kwargs:
             kwargs.clear()
+
         result = self.__constructor__(
             query_compiler=self._query_compiler.agg(
                 func=func,
diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index 3c5f48b6d4f..877ad4578d9 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -2103,6 +2103,7 @@ def _get_names_wrapper(list_of_objs, names, prefix):
         row_idx_names = []
         dfs = []
         arrays = []
+        array_lengths = []
         for obj in index:
             if isinstance(obj, Series):
                 row_idx_names.append(obj.name)
@@ -2119,6 +2120,7 @@ def _get_names_wrapper(list_of_objs, names, prefix):
                 dfs.append(obj)
             else:
                 row_idx_names.append(None)
+                array_lengths.append(len(obj))
                 df = pd.DataFrame(obj)
                 df.columns = unique_rownames[
                     rownames_idx : rownames_idx + len(df.columns)
@@ -2144,6 +2146,7 @@ def _get_names_wrapper(list_of_objs, names, prefix):
                 dfs.append(obj)
             else:
                 col_idx_names.append(None)
+                array_lengths.append(len(obj))
                 df = pd.DataFrame(obj)
                 df.columns = unique_colnames[
                     colnames_idx : colnames_idx + len(df.columns)
@@ -2151,6 +2154,9 @@ def _get_names_wrapper(list_of_objs, names, prefix):
                 colnames_idx += len(df.columns)
                 arrays.append(df)
 
+        if len(set(array_lengths)) > 1:
+            raise ValueError("All arrays must be of the same length")
+
         # Now, we have two lists - a list of Snowpark pandas objects, and a list of objects
         # that were not passed in as Snowpark pandas objects, but that we have converted
         # to Snowpark pandas objects to give them column names. We can perform inner joins
@@ -2159,10 +2165,15 @@ def _get_names_wrapper(list_of_objs, names, prefix):
         df = dfs[0]
         for right in dfs[1:]:
             df = df.merge(right, left_index=True, right_index=True)
-
         if len(arrays) > 0:
             index = df.index
             right_df = pd.concat(arrays, axis=1)
+            # Increases query count by 1, but necessary for error checking.
+            index_length = len(df)
+            if index_length != array_lengths[0]:
+                raise ValueError(
+                    f"Length mismatch: Expected {array_lengths[0]} rows, received array of length {index_length}"
+                )
             right_df.index = index
             df = df.merge(right_df, left_index=True, right_index=True)
     else:
diff --git a/tests/integ/modin/crosstab/__init__.py b/tests/integ/modin/crosstab/__init__.py
deleted file mode 100644
index 0fbef920926..00000000000
--- a/tests/integ/modin/crosstab/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-#
-# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
-#
diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index 8dace579af7..48fa27bd85b 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -19,9 +19,9 @@
 
 @pytest.mark.parametrize("dropna", [True, False])
 class TestCrosstab:
-    def test_basic_crosstab(self, dropna):
-        query_count = 3
-        join_count = 0 if dropna else 3
+    def test_basic_crosstab_with_numpy_arrays(self, dropna):
+        query_count = 1
+        join_count = 0 if dropna else 1
         a = np.array(
             [
                 "foo",
@@ -71,17 +71,79 @@ def test_basic_crosstab(self, dropna):
             dtype=object,
         )
         with SqlCounter(query_count=query_count, join_count=join_count):
-            native_df = native_pd.crosstab(
-                a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+            eval_snowpark_pandas_result(
+                pd,
+                native_pd,
+                lambda lib: lib.crosstab(
+                    a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+                ),
             )
-            snow_df = pd.crosstab(
-                a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+
+    def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna):
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = np.array(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            dtype=object,
+        )
+        c = np.array(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            dtype=object,
+        )
+        with SqlCounter(query_count=0):
+            eval_snowpark_pandas_result(
+                pd,
+                native_pd,
+                lambda lib: lib.crosstab(
+                    a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+                ),
+                assert_exception_equal=True,
+                expect_exception=True,
+                expect_exception_match="All arrays must be of the same length",
+                expect_exception_type=ValueError,
             )
-            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
+    # In these tests, `overlap` refers to the intersection of the indices
+    # of the Series objects being passed in to crosstab. crosstab takes
+    # only the intersection of the index objects of all Series when determining
+    # the final DataFrame to pass into pivot_table, so here, we are testing
+    # that we follow that behavior.
     def test_basic_crosstab_with_series_objs_full_overlap(self, dropna):
-        query_count = 5
-        join_count = 13 if dropna else 28
+        # In this case, all indexes are identical - hence "full" overlap.
+        query_count = 2
+        join_count = 5 if dropna else 10
         a = np.array(
             [
                 "foo",
@@ -142,8 +204,14 @@ def test_basic_crosstab_with_series_objs_full_overlap(self, dropna):
             assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
     def test_basic_crosstab_with_series_objs_some_overlap(self, dropna):
-        query_count = 5
-        join_count = 13 if dropna else 28
+        # In this case, some values are shared across indexes (non-zero intersection),
+        # hence "some" overlap.
+        # When a mix of Series and non-Series objects are passed in, the non-Series
+        # objects are expected to have the same length as the intersection of the indexes
+        # of the Series objects. This test case passes because we pass in arrays that
+        # are the length of the intersection rather than the length of each of the Series.
+        query_count = 2
+        join_count = 5 if dropna else 10
         a = np.array(
             [
                 "foo",
@@ -217,8 +285,15 @@ def eval_func(args_list):
                 eval_func,
             )
 
-    @sql_count_checker(query_count=2, join_count=1)
+    @sql_count_checker(query_count=1, join_count=1)
     def test_basic_crosstab_with_series_objs_some_overlap_error(self, dropna):
+        # Same as above - the intersection of the indexes of the Series objects
+        # is non-zero, but the indexes are not identical - hence "some" overlap.
+        # When a mix of Series and non-Series objects are passed in, the non-Series
+        # objects are expected to have the same length as the intersection of the indexes
+        # of the Series objects. This test case errors because we pass in arrays that
+        # are the length of the Series, rather than the length of the intersection of
+        # the indexes of the Series.
         a = np.array(
             [
                 "foo",
@@ -296,8 +371,11 @@ def eval_func(args_list):
             assert_exception_equal=False,  # Our error message is a little different.
         )
 
-    @sql_count_checker(query_count=2, join_count=1)
+    @sql_count_checker(query_count=1, join_count=1)
     def test_basic_crosstab_with_series_objs_no_overlap_error(self, dropna):
+        # In this case, no values are shared across the indexes - the intersection is an
+        # empty set - hence "no" overlap. We error here for the same reason as above - the
+        # arrays passed in should also be empty, but are non-empty.
         a = np.array(
             [
                 "foo",
@@ -376,8 +454,8 @@ def eval_func(args_list):
         )
 
     def test_basic_crosstab_with_df_and_series_objs_pandas_errors(self, dropna):
-        query_count = 6
-        join_count = 3 if dropna else 9
+        query_count = 4
+        join_count = 1 if dropna else 3
         a = native_pd.Series(
             [
                 "foo",
@@ -456,9 +534,9 @@ def eval_func(args_list):
             )
 
     def test_margins(self, dropna):
-        query_count = 3
-        join_count = 3 if dropna else 6
-        union_count = 3
+        query_count = 1
+        join_count = 1 if dropna else 2
+        union_count = 1
         a = np.array(
             [
                 "foo",
@@ -510,32 +588,26 @@ def test_margins(self, dropna):
         with SqlCounter(
             query_count=query_count, join_count=join_count, union_count=union_count
         ):
-            native_df = native_pd.crosstab(
-                a,
-                [b, c],
-                rownames=["a"],
-                colnames=["b", "c"],
-                margins=True,
-                margins_name="MARGINS_NAME",
-                dropna=dropna,
-            )
-            snow_df = pd.crosstab(
-                a,
-                [b, c],
-                rownames=["a"],
-                colnames=["b", "c"],
-                margins=True,
-                margins_name="MARGINS_NAME",
-                dropna=dropna,
+            eval_snowpark_pandas_result(
+                pd,
+                native_pd,
+                lambda lib: lib.crosstab(
+                    a,
+                    [b, c],
+                    rownames=["a"],
+                    colnames=["b", "c"],
+                    margins=True,
+                    margins_name="MARGINS_NAME",
+                    dropna=dropna,
+                ),
             )
-            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
     def test_normalize(self, dropna, normalize):
-        query_count = 4 if normalize not in (0, "index") else 3
-        join_count = 0 if normalize not in (0, "index") else 3
-        if not dropna:
-            join_count = 9 if normalize in (0, "index") else 4
+        query_count = 1 if normalize in (0, "index") else 2
+        join_count = 3 if normalize in (0, "index") else 2
+        if dropna:
+            join_count -= 2
         a = np.array(
             [
                 "foo",
@@ -585,30 +657,25 @@ def test_normalize(self, dropna, normalize):
             dtype=object,
         )
         with SqlCounter(query_count=query_count, join_count=join_count):
-            native_df = native_pd.crosstab(
-                a,
-                [b, c],
-                rownames=["a"],
-                colnames=["b", "c"],
-                normalize=normalize,
-                dropna=dropna,
-            )
-            snow_df = pd.crosstab(
-                a,
-                [b, c],
-                rownames=["a"],
-                colnames=["b", "c"],
-                normalize=normalize,
-                dropna=dropna,
+            eval_snowpark_pandas_result(
+                pd,
+                native_pd,
+                lambda lib: lib.crosstab(
+                    a,
+                    [b, c],
+                    rownames=["a"],
+                    colnames=["b", "c"],
+                    normalize=normalize,
+                    dropna=dropna,
+                ),
             )
-            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
     def test_normalize_and_margins(self, dropna, normalize):
         counts = {
-            "columns": [5, 11 if dropna else 19, 8],
-            "index": [3, 15 if dropna else 24, 9],
-            "all": [7, 41 if dropna else 63, 22],
+            "columns": [3, 5 if dropna else 9, 4],
+            "index": [1, 5 if dropna else 8, 3],
+            "all": [3, 12 if dropna else 19, 7],
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
@@ -669,30 +736,24 @@ def test_normalize_and_margins(self, dropna, normalize):
             join_count=sql_counts[1],
             union_count=sql_counts[2],
         ):
-            native_df = native_pd.crosstab(
-                a,
-                [b, c],
-                rownames=["a"],
-                colnames=["b", "c"],
-                normalize=normalize,
-                margins=True,
-                dropna=dropna,
-            )
-            snow_df = pd.crosstab(
-                a,
-                [b, c],
-                rownames=["a"],
-                colnames=["b", "c"],
-                normalize=normalize,
-                margins=True,
-                dropna=dropna,
+            eval_snowpark_pandas_result(
+                pd,
+                native_pd,
+                lambda lib: lib.crosstab(
+                    a,
+                    [b, c],
+                    rownames=["a"],
+                    colnames=["b", "c"],
+                    normalize=normalize,
+                    margins=True,
+                    dropna=dropna,
+                ),
             )
-            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
     @pytest.mark.parametrize("aggfunc", ["mean", "sum"])
     def test_values(self, dropna, aggfunc):
-        query_count = 3
-        join_count = 6 if dropna else 15
+        query_count = 1
+        join_count = 2 if dropna else 5
         native_df = native_pd.DataFrame(
             {
                 "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
@@ -711,26 +772,22 @@ def test_values(self, dropna, aggfunc):
         )
 
         with SqlCounter(query_count=query_count, join_count=join_count):
-            native_df_result = native_pd.crosstab(
-                native_df["species"].values,
-                native_df["favorite_food"].values,
-                values=native_df["age"].values,
-                aggfunc=aggfunc,
-                dropna=dropna,
-            )
-            snow_df = pd.crosstab(
-                native_df["species"].values,
-                native_df["favorite_food"].values,
-                values=native_df["age"].values,
-                aggfunc=aggfunc,
-                dropna=dropna,
+            eval_snowpark_pandas_result(
+                pd,
+                native_pd,
+                lambda lib: lib.crosstab(
+                    native_df["species"].values,
+                    native_df["favorite_food"].values,
+                    values=native_df["age"].values,
+                    aggfunc=aggfunc,
+                    dropna=dropna,
+                ),
             )
-            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df_result)
 
     @pytest.mark.parametrize("aggfunc", ["mean", "sum"])
     def test_values_series_like(self, dropna, aggfunc):
-        query_count = 9
-        join_count = 10 if dropna else 25
+        query_count = 5
+        join_count = 2 if dropna else 5
         native_df = native_pd.DataFrame(
             {
                 "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
@@ -765,3 +822,70 @@ def test_values_series_like(self, dropna, aggfunc):
                 dropna=dropna,
             )
             assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+
+
+@sql_count_checker(query_count=0)
+def test_values_unsupported_aggfunc():
+    native_df = native_pd.DataFrame(
+        {
+            "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
+            "favorite_food": [
+                "chicken",
+                "fish",
+                "fish",
+                "beef",
+                "chicken",
+                "beef",
+                "fish",
+                "beef",
+            ],
+            "age": [7, 2, 8, 5, 9, 3, 6, 1],
+        }
+    )
+
+    with pytest.raises(
+        NotImplementedError,
+        match="Snowpark pandas DataFrame.pivot_table does not yet support the aggregation 'median' with the given arguments.",
+    ):
+        pd.crosstab(
+            native_df["species"].values,
+            native_df["favorite_food"].values,
+            values=native_df["age"].values,
+            aggfunc="median",
+            dropna=False,
+        )
+
+
+@sql_count_checker(query_count=4)
+def test_values_series_like_unsupported_aggfunc():
+    # The query count above comes from building the DataFrame
+    # that we pass in to pivot table.
+    native_df = native_pd.DataFrame(
+        {
+            "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
+            "favorite_food": [
+                "chicken",
+                "fish",
+                "fish",
+                "beef",
+                "chicken",
+                "beef",
+                "fish",
+                "beef",
+            ],
+            "age": [7, 2, 8, 5, 9, 3, 6, 1],
+        }
+    )
+    snow_df = pd.DataFrame(native_df)
+
+    with pytest.raises(
+        NotImplementedError,
+        match="Snowpark pandas DataFrame.pivot_table does not yet support the aggregation 'median' with the given arguments.",
+    ):
+        snow_df = pd.crosstab(
+            snow_df["species"],
+            snow_df["favorite_food"],
+            values=snow_df["age"],
+            aggfunc="median",
+            dropna=False,
+        )

From e8bcc6305451b2ede65c807dcb31b6e3ef01adb1 Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Tue, 27 Aug 2024 20:05:19 -0700
Subject: [PATCH 11/20] Use eval_snowpark...

---
 tests/integ/modin/crosstab/test_crosstab.py | 68 ++++++++++++---------
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index 48fa27bd85b..9dfda0a60c0 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -11,10 +11,7 @@
 
 import snowflake.snowpark.modin.plugin  # noqa: F401
 from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker
-from tests.integ.modin.utils import (
-    assert_snowpark_pandas_equal_to_pandas,
-    eval_snowpark_pandas_result,
-)
+from tests.integ.modin.utils import eval_snowpark_pandas_result
 
 
 @pytest.mark.parametrize("dropna", [True, False])
@@ -190,18 +187,23 @@ def test_basic_crosstab_with_series_objs_full_overlap(self, dropna):
                 "shiny",
             ],
         )
+
+        def eval_func(lib):
+            if lib is pd:
+                return lib.crosstab(
+                    a,
+                    [lib.Series(b), lib.Series(c)],
+                    rownames=["a"],
+                    colnames=["b", "c"],
+                    dropna=dropna,
+                )
+            else:
+                return lib.crosstab(
+                    a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
+                )
+
         with SqlCounter(query_count=query_count, join_count=join_count):
-            native_df = native_pd.crosstab(
-                a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=dropna
-            )
-            snow_df = pd.crosstab(
-                a,
-                [pd.Series(b), pd.Series(c)],
-                rownames=["a"],
-                colnames=["b", "c"],
-                dropna=dropna,
-            )
-            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
+            eval_snowpark_pandas_result(pd, native_pd, eval_func)
 
     def test_basic_crosstab_with_series_objs_some_overlap(self, dropna):
         # In this case, some values are shared across indexes (non-zero intersection),
@@ -806,22 +808,30 @@ def test_values_series_like(self, dropna, aggfunc):
         )
         snow_df = pd.DataFrame(native_df)
 
+        def eval_func(df):
+            if isinstance(df, pd.DataFrame):
+                return pd.crosstab(
+                    df["species"],
+                    df["favorite_food"],
+                    values=df["age"],
+                    aggfunc=aggfunc,
+                    dropna=dropna,
+                )
+            else:
+                return native_pd.crosstab(
+                    df["species"],
+                    df["favorite_food"],
+                    values=df["age"],
+                    aggfunc=aggfunc,
+                    dropna=dropna,
+                )
+
         with SqlCounter(query_count=query_count, join_count=join_count):
-            native_df = native_pd.crosstab(
-                native_df["species"],
-                native_df["favorite_food"],
-                values=native_df["age"],
-                aggfunc=aggfunc,
-                dropna=dropna,
-            )
-            snow_df = pd.crosstab(
-                snow_df["species"],
-                snow_df["favorite_food"],
-                values=snow_df["age"],
-                aggfunc=aggfunc,
-                dropna=dropna,
+            eval_snowpark_pandas_result(
+                snow_df,
+                native_df,
+                eval_func,
             )
-            assert_snowpark_pandas_equal_to_pandas(snow_df, native_df)
 
 
 @sql_count_checker(query_count=0)

From 6399f4ccc16d69da5c0398f29e40b8c1e6408fea Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Tue, 27 Aug 2024 20:06:27 -0700
Subject: [PATCH 12/20] Remove crosstab from unsupported tests

---
 tests/unit/modin/test_unsupported.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/unit/modin/test_unsupported.py b/tests/unit/modin/test_unsupported.py
index ad0ae0aa5d6..8b07321a94c 100644
--- a/tests/unit/modin/test_unsupported.py
+++ b/tests/unit/modin/test_unsupported.py
@@ -45,7 +45,6 @@ def test_unsupported_io(io_method, kwargs):
     [
         ["merge_ordered", {"left": "", "right": ""}],
         ["value_counts", {"values": ""}],
-        ["crosstab", {"index": "", "columns": ""}],
         ["lreshape", {"data": "", "groups": ""}],
         ["wide_to_long", {"df": "", "stubnames": "", "i": "", "j": ""}],
         ["to_timedelta", {"arg": ""}],

From 53a41253c6776e98438cbf265b74003d93662f7a Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Tue, 27 Aug 2024 20:09:50 -0700
Subject: [PATCH 13/20] Fix docs, add all aggfuncs to tests

---
 docs/source/modin/supported/general_supported.rst | 3 ++-
 src/snowflake/snowpark/modin/pandas/general.py    | 5 +++++
 tests/integ/modin/crosstab/test_crosstab.py       | 4 ++--
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst
index 0a67e57b39b..d13ee994fb8 100644
--- a/docs/source/modin/supported/general_supported.rst
+++ b/docs/source/modin/supported/general_supported.rst
@@ -18,7 +18,8 @@ Data manipulations
 | ``concat``                  | P                               | ``levels`` is not supported,     |                                                    |
 |                             |                                 | ``copy`` is ignored              |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``crosstab``                | Y                               |                                  |                                                    |
+| ``crosstab``                | P                               |                                  | ``N`` if ``aggfunc`` is not one of                 |
+|                             |                                 |                                  | "count", "mean", "min", "max", or "sum"            |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``cut``                     | P                               | ``retbins``, ``labels``          | ``N`` if ``retbins=True``or ``labels!=False``      |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index 877ad4578d9..69a10cc08c0 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -2031,6 +2031,11 @@ def crosstab(
     DataFrame
         Cross tabulation of the data.
 
+    Notes
+    -----
+
+    Raises NotImplementedError if aggfunc is not one of "count", "mean", "min", "max", or "sum".
+
     Examples
     --------
     >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index 9dfda0a60c0..562f8eb506d 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -752,7 +752,7 @@ def test_normalize_and_margins(self, dropna, normalize):
                 ),
             )
 
-    @pytest.mark.parametrize("aggfunc", ["mean", "sum"])
+    @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_values(self, dropna, aggfunc):
         query_count = 1
         join_count = 2 if dropna else 5
@@ -786,7 +786,7 @@ def test_values(self, dropna, aggfunc):
                 ),
             )
 
-    @pytest.mark.parametrize("aggfunc", ["mean", "sum"])
+    @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_values_series_like(self, dropna, aggfunc):
         query_count = 5
         join_count = 2 if dropna else 5

From 76c01e2ab2a2986d59534603a82c06e61ee5ef9b Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Tue, 27 Aug 2024 20:31:15 -0700
Subject: [PATCH 14/20] Fix docs

---
 .../modin/supported/general_supported.rst     |   4 +-
 .../snowpark/modin/pandas/general.py          |  12 +-
 tests/integ/modin/crosstab/test_crosstab.py   | 163 ++++++++++++++++++
 3 files changed, 177 insertions(+), 2 deletions(-)

diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst
index d13ee994fb8..cfd6d2d4c5a 100644
--- a/docs/source/modin/supported/general_supported.rst
+++ b/docs/source/modin/supported/general_supported.rst
@@ -19,7 +19,9 @@ Data manipulations
 |                             |                                 | ``copy`` is ignored              |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``crosstab``                | P                               |                                  | ``N`` if ``aggfunc`` is not one of                 |
-|                             |                                 |                                  | "count", "mean", "min", "max", or "sum"            |
+|                             |                                 |                                  | "count", "mean", "min", "max", or "sum", or        |
+|                             |                                 |                                  | margins is True, normalize is "all" or True,       |
+|                             |                                 |                                  | and values is passed.                              |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``cut``                     | P                               | ``retbins``, ``labels``          | ``N`` if ``retbins=True``or ``labels!=False``      |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index 69a10cc08c0..4bd70492c2f 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -2034,7 +2034,8 @@ def crosstab(
     Notes
     -----
 
-    Raises NotImplementedError if aggfunc is not one of "count", "mean", "min", "max", or "sum".
+    Raises NotImplementedError if aggfunc is not one of "count", "mean", "min", "max", or "sum", or
+    margins is True, normalize is True or all, and values is passed.
 
     Examples
     --------
@@ -2063,6 +2064,15 @@ def crosstab(
     if not is_nested_list_like(columns):
         columns = [columns]
 
+    if (
+        values is not None
+        and margins is True
+        and (normalize is True or normalize == "all")
+    ):
+        raise NotImplementedError(
+            'Snowpark pandas does not yet support passing in margins=True, normalize="all", and values.'
+        )
+
     user_passed_rownames = rownames is not None
     user_passed_colnames = colnames is not None
 
diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index 562f8eb506d..e30a4c2b4db 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -752,6 +752,169 @@ def test_normalize_and_margins(self, dropna, normalize):
                 ),
             )
 
+    @pytest.mark.parametrize("normalize", [0, 1, "index", "columns"])
+    @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
+    def test_normalize_margins_and_values(self, dropna, normalize, aggfunc):
+        counts = {
+            "columns": [3, 29 if dropna else 41, 4],
+            "index": [1, 23 if dropna else 32, 3],
+            "all": [3, 54 if dropna else 75, 7],
+        }
+        counts[0] = counts["index"]
+        counts[1] = counts["columns"]
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = np.array(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            dtype=object,
+        )
+        c = np.array(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            dtype=object,
+        )
+        vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0])
+        if normalize is True:
+            sql_counts = counts["all"]
+        else:
+            sql_counts = counts[normalize]
+
+        def eval_func(lib):
+            df = lib.crosstab(
+                a,
+                [b, c],
+                rownames=["a"],
+                colnames=["b", "c"],
+                values=vals,
+                normalize=normalize,
+                margins=True,
+                dropna=dropna,
+                aggfunc=aggfunc,
+            )
+            if aggfunc == "sum":
+                # For some reason, the rounding is different for sum.
+                df = df.round(decimals=6)
+            return df
+
+        with SqlCounter(
+            query_count=sql_counts[0],
+            join_count=sql_counts[1],
+            union_count=sql_counts[2],
+        ):
+            eval_snowpark_pandas_result(
+                pd,
+                native_pd,
+                eval_func,
+            )
+
+    @pytest.mark.parametrize("normalize", ["all", True])
+    @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
+    @sql_count_checker(query_count=0)
+    def test_normalize_margins_and_values_not_supported(
+        self, dropna, normalize, aggfunc
+    ):
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = np.array(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            dtype=object,
+        )
+        c = np.array(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            dtype=object,
+        )
+        vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0])
+        with pytest.raises(
+            NotImplementedError,
+            match='Snowpark pandas does not yet support passing in margins=True, normalize="all", and values.',
+        ):
+            pd.crosstab(
+                a,
+                [b, c],
+                rownames=["a"],
+                colnames=["b", "c"],
+                values=vals,
+                normalize=normalize,
+                margins=True,
+                dropna=dropna,
+                aggfunc=aggfunc,
+            )
+
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_values(self, dropna, aggfunc):
         query_count = 1

From 906e00b64f7f8efa7ebeef8bfadc8e10d973249b Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Wed, 28 Aug 2024 14:56:10 -0700
Subject: [PATCH 15/20] Add tests

---
 tests/integ/modin/crosstab/test_crosstab.py | 168 +++++++++++++++++++-
 1 file changed, 167 insertions(+), 1 deletion(-)

diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index e30a4c2b4db..2c156630ad8 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -829,7 +829,7 @@ def eval_func(lib):
                 aggfunc=aggfunc,
             )
             if aggfunc == "sum":
-                # For some reason, the rounding is different for sum.
+                # Thanks to our hack, the rounding is different for sum.
                 df = df.round(decimals=6)
             return df
 
@@ -844,6 +844,172 @@ def eval_func(lib):
                 eval_func,
             )
 
+    @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
+    def test_margins_and_values(self, dropna, aggfunc):
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = np.array(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            dtype=object,
+        )
+        c = np.array(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            dtype=object,
+        )
+        vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0])
+
+        def eval_func(lib):
+            df = lib.crosstab(
+                a,
+                [b, c],
+                rownames=["a"],
+                colnames=["b", "c"],
+                values=vals,
+                margins=True,
+                dropna=dropna,
+                aggfunc=aggfunc,
+            )
+            return df
+
+        with SqlCounter(
+            query_count=1,
+            join_count=7 if dropna else 10,
+            union_count=1,
+        ):
+            eval_snowpark_pandas_result(
+                pd,
+                native_pd,
+                eval_func,
+            )
+
+    @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
+    @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
+    def test_normalize_and_values(self, dropna, normalize, aggfunc):
+        counts = {
+            "columns": [2, 4 if dropna else 10],
+            "index": [1, 5 if dropna else 11],
+            "all": [2, 4 if dropna else 10],
+        }
+        counts[0] = counts["index"]
+        counts[1] = counts["columns"]
+        a = np.array(
+            [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            dtype=object,
+        )
+        b = np.array(
+            [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            dtype=object,
+        )
+        c = np.array(
+            [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            dtype=object,
+        )
+        vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0])
+        if normalize is True:
+            sql_counts = counts["all"]
+        else:
+            sql_counts = counts[normalize]
+
+        def eval_func(lib):
+            df = lib.crosstab(
+                a,
+                [b, c],
+                rownames=["a"],
+                colnames=["b", "c"],
+                values=vals,
+                normalize=normalize,
+                dropna=dropna,
+                aggfunc=aggfunc,
+            )
+            if aggfunc in ["sum", "max"]:
+                # Thanks to our hack, the rounding is different for sum and max.
+                df = df.round(decimals=6)
+            return df
+
+        with SqlCounter(
+            query_count=sql_counts[0],
+            join_count=sql_counts[1],
+        ):
+            eval_snowpark_pandas_result(
+                pd,
+                native_pd,
+                eval_func,
+            )
+
     @pytest.mark.parametrize("normalize", ["all", True])
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     @sql_count_checker(query_count=0)

From 5cdc71210f546e0d99459c415f8f327a9ffdb5a5 Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Wed, 28 Aug 2024 16:18:29 -0700
Subject: [PATCH 16/20] Address review comments

---
 .../snowpark/modin/pandas/general.py          |   9 +
 tests/integ/modin/crosstab/test_crosstab.py   | 774 ++----------------
 tests/integ/modin/crosstab/test_utils.py      |  91 ++
 3 files changed, 151 insertions(+), 723 deletions(-)
 create mode 100644 tests/integ/modin/crosstab/test_utils.py

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index 4bd70492c2f..e0afb909fa9 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -2282,6 +2282,15 @@ def _get_names_wrapper(list_of_objs, names, prefix):
             elif normalize == "all":
                 # Normalize core
                 f = normalizers[normalize]
+
+                # When we perform the normalization function, we take the sum over
+                # the rows, and divide every value by the sum. Since margins is included
+                # though, the result of the sum is actually 2 * the sum of the original
+                # values (since the margin itself is the sum of the original values),
+                # so we need to multiply by 2 here to account for that.
+                # The alternative would be to apply normalization to the main table
+                # and the index margins separately, but that would require additional joins
+                # to get the final table, which we want to avoid.
                 table = f(table.iloc[:, :-1]) * 2.0
 
                 column_margin = column_margin / column_margin.sum()
diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index 2c156630ad8..8c228e895da 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -16,57 +16,9 @@
 
 @pytest.mark.parametrize("dropna", [True, False])
 class TestCrosstab:
-    def test_basic_crosstab_with_numpy_arrays(self, dropna):
+    def test_basic_crosstab_with_numpy_arrays(self, dropna, a, b, c):
         query_count = 1
         join_count = 0 if dropna else 1
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
-        b = np.array(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
-            dtype=object,
-        )
-        c = np.array(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
-            dtype=object,
-        )
         with SqlCounter(query_count=query_count, join_count=join_count):
             eval_snowpark_pandas_result(
                 pd,
@@ -76,49 +28,10 @@ def test_basic_crosstab_with_numpy_arrays(self, dropna):
                 ),
             )
 
-    def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna):
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
-        b = np.array(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
-            dtype=object,
-        )
-        c = np.array(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
-            dtype=object,
-        )
+    def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna, a, b, c):
+        a = a[:-1]
+        b = b[:-2]
+        c = c[:-3]
         with SqlCounter(query_count=0):
             eval_snowpark_pandas_result(
                 pd,
@@ -137,56 +50,10 @@ def test_basic_crosstab_with_numpy_arrays_different_lengths(self, dropna):
     # only the intersection of the index objects of all Series when determining
     # the final DataFrame to pass into pivot_table, so here, we are testing
     # that we follow that behavior.
-    def test_basic_crosstab_with_series_objs_full_overlap(self, dropna):
+    def test_basic_crosstab_with_series_objs_full_overlap(self, dropna, a, b, c):
         # In this case, all indexes are identical - hence "full" overlap.
         query_count = 2
         join_count = 5 if dropna else 10
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
-        b = native_pd.Series(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
-        )
-        c = native_pd.Series(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
-        )
 
         def eval_func(lib):
             if lib is pd:
@@ -205,7 +72,7 @@ def eval_func(lib):
         with SqlCounter(query_count=query_count, join_count=join_count):
             eval_snowpark_pandas_result(pd, native_pd, eval_func)
 
-    def test_basic_crosstab_with_series_objs_some_overlap(self, dropna):
+    def test_basic_crosstab_with_series_objs_some_overlap(self, dropna, a, b, c):
         # In this case, some values are shared across indexes (non-zero intersection),
         # hence "some" overlap.
         # When a mix of Series and non-Series objects are passed in, the non-Series
@@ -214,52 +81,12 @@ def test_basic_crosstab_with_series_objs_some_overlap(self, dropna):
         # are the length of the intersection rather than the length of each of the Series.
         query_count = 2
         join_count = 5 if dropna else 10
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
         b = native_pd.Series(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
+            b,
             index=list(range(len(a))),
         )
         c = native_pd.Series(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
+            c,
             index=-1 * np.array(list(range(len(a)))),
         )
 
@@ -288,7 +115,7 @@ def eval_func(args_list):
             )
 
     @sql_count_checker(query_count=1, join_count=1)
-    def test_basic_crosstab_with_series_objs_some_overlap_error(self, dropna):
+    def test_basic_crosstab_with_series_objs_some_overlap_error(self, dropna, a, b, c):
         # Same as above - the intersection of the indexes of the Series objects
         # is non-zero, but the indexes are not identical - hence "some" overlap.
         # When a mix of Series and non-Series objects are passed in, the non-Series
@@ -296,52 +123,12 @@ def test_basic_crosstab_with_series_objs_some_overlap_error(self, dropna):
         # of the Series objects. This test case errors because we pass in arrays that
         # are the length of the Series, rather than the length of the intersection of
         # the indexes of the Series.
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
         b = native_pd.Series(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
+            b,
             index=list(range(len(a))),
         )
         c = native_pd.Series(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
+            c,
             index=-1 * np.array(list(range(len(a)))),
         )
 
@@ -374,56 +161,16 @@ def eval_func(args_list):
         )
 
     @sql_count_checker(query_count=1, join_count=1)
-    def test_basic_crosstab_with_series_objs_no_overlap_error(self, dropna):
+    def test_basic_crosstab_with_series_objs_no_overlap_error(self, dropna, a, b, c):
         # In this case, no values are shared across the indexes - the intersection is an
         # empty set - hence "no" overlap. We error here for the same reason as above - the
         # arrays passed in should also be empty, but are non-empty.
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
         b = native_pd.Series(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
+            b,
             index=list(range(len(a))),
         )
         c = native_pd.Series(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
+            c,
             index=-1 - np.array(list(range(len(a)))),
         )
 
@@ -455,53 +202,19 @@ def eval_func(args_list):
             assert_exception_equal=False,  # Our error message is a little different.
         )
 
-    def test_basic_crosstab_with_df_and_series_objs_pandas_errors(self, dropna):
+    def test_basic_crosstab_with_df_and_series_objs_pandas_errors(
+        self, dropna, a, b, c
+    ):
         query_count = 4
         join_count = 1 if dropna else 3
         a = native_pd.Series(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
+            a,
             dtype=object,
         )
         b = native_pd.DataFrame(
             {
-                "0": [
-                    "one",
-                    "one",
-                    "one",
-                    "two",
-                    "one",
-                    "one",
-                    "one",
-                    "two",
-                    "two",
-                    "two",
-                    "one",
-                ],
-                "1": [
-                    "dull",
-                    "dull",
-                    "shiny",
-                    "dull",
-                    "dull",
-                    "shiny",
-                    "shiny",
-                    "dull",
-                    "shiny",
-                    "shiny",
-                    "shiny",
-                ],
+                "0": b,
+                "1": c,
             }
         )
         # pandas expects only Series objects, or DataFrames that have only a single column, while
@@ -535,58 +248,11 @@ def eval_func(args_list):
                 eval_func,
             )
 
-    def test_margins(self, dropna):
+    def test_margins(self, dropna, a, b, c):
         query_count = 1
         join_count = 1 if dropna else 2
         union_count = 1
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
-        b = np.array(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
-            dtype=object,
-        )
-        c = np.array(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
-            dtype=object,
-        )
+
         with SqlCounter(
             query_count=query_count, join_count=join_count, union_count=union_count
         ):
@@ -605,59 +271,12 @@ def test_margins(self, dropna):
             )
 
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
-    def test_normalize(self, dropna, normalize):
+    def test_normalize(self, dropna, normalize, a, b, c):
         query_count = 1 if normalize in (0, "index") else 2
         join_count = 3 if normalize in (0, "index") else 2
         if dropna:
             join_count -= 2
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
-        b = np.array(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
-            dtype=object,
-        )
-        c = np.array(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
-            dtype=object,
-        )
+
         with SqlCounter(query_count=query_count, join_count=join_count):
             eval_snowpark_pandas_result(
                 pd,
@@ -673,7 +292,7 @@ def test_normalize(self, dropna, normalize):
             )
 
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
-    def test_normalize_and_margins(self, dropna, normalize):
+    def test_normalize_and_margins(self, dropna, normalize, a, b, c):
         counts = {
             "columns": [3, 5 if dropna else 9, 4],
             "index": [1, 5 if dropna else 8, 3],
@@ -681,54 +300,7 @@ def test_normalize_and_margins(self, dropna, normalize):
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
-        b = np.array(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
-            dtype=object,
-        )
-        c = np.array(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
-            dtype=object,
-        )
+
         if normalize is True:
             sql_counts = counts["all"]
         else:
@@ -754,7 +326,7 @@ def test_normalize_and_margins(self, dropna, normalize):
 
     @pytest.mark.parametrize("normalize", [0, 1, "index", "columns"])
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
-    def test_normalize_margins_and_values(self, dropna, normalize, aggfunc):
+    def test_normalize_margins_and_values(self, dropna, normalize, aggfunc, a, b, c):
         counts = {
             "columns": [3, 29 if dropna else 41, 4],
             "index": [1, 23 if dropna else 32, 3],
@@ -762,54 +334,6 @@ def test_normalize_margins_and_values(self, dropna, normalize, aggfunc):
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
-        b = np.array(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
-            dtype=object,
-        )
-        c = np.array(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
-            dtype=object,
-        )
         vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0])
         if normalize is True:
             sql_counts = counts["all"]
@@ -829,7 +353,12 @@ def eval_func(lib):
                 aggfunc=aggfunc,
             )
             if aggfunc == "sum":
-                # Thanks to our hack, the rounding is different for sum.
+                # When normalizing the data, we apply the normalization function to the
+                # entire table (including margins), which requires us to multiply by 2
+                # (since the function takes the sum over the rows, and the margins row is
+                # itself the sum over the rows, causing the sum over all rows to be equal
+                # to 2 * the sum over the input rows). This hack allows us to save on joins
+                # but results in slight precision issues.
                 df = df.round(decimals=6)
             return df
 
@@ -845,55 +374,7 @@ def eval_func(lib):
             )
 
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
-    def test_margins_and_values(self, dropna, aggfunc):
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
-        b = np.array(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
-            dtype=object,
-        )
-        c = np.array(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
-            dtype=object,
-        )
+    def test_margins_and_values(self, dropna, aggfunc, a, b, c):
         vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0])
 
         def eval_func(lib):
@@ -922,7 +403,7 @@ def eval_func(lib):
 
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
-    def test_normalize_and_values(self, dropna, normalize, aggfunc):
+    def test_normalize_and_values(self, dropna, normalize, aggfunc, a, b, c):
         counts = {
             "columns": [2, 4 if dropna else 10],
             "index": [1, 5 if dropna else 11],
@@ -930,54 +411,6 @@ def test_normalize_and_values(self, dropna, normalize, aggfunc):
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
-        b = np.array(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
-            dtype=object,
-        )
-        c = np.array(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
-            dtype=object,
-        )
         vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0])
         if normalize is True:
             sql_counts = counts["all"]
@@ -996,7 +429,12 @@ def eval_func(lib):
                 aggfunc=aggfunc,
             )
             if aggfunc in ["sum", "max"]:
-                # Thanks to our hack, the rounding is different for sum and max.
+                # When normalizing the data, we apply the normalization function to the
+                # entire table (including margins), which requires us to multiply by 2
+                # (since the function takes the sum over the rows, and the margins row is
+                # itself the sum over the rows, causing the sum over all rows to be equal
+                # to 2 * the sum over the input rows). This hack allows us to save on joins
+                # but results in slight precision issues.
                 df = df.round(decimals=6)
             return df
 
@@ -1014,56 +452,8 @@ def eval_func(lib):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     @sql_count_checker(query_count=0)
     def test_normalize_margins_and_values_not_supported(
-        self, dropna, normalize, aggfunc
+        self, dropna, normalize, aggfunc, a, b, c
     ):
-        a = np.array(
-            [
-                "foo",
-                "foo",
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "bar",
-                "bar",
-                "foo",
-                "foo",
-                "foo",
-            ],
-            dtype=object,
-        )
-        b = np.array(
-            [
-                "one",
-                "one",
-                "one",
-                "two",
-                "one",
-                "one",
-                "one",
-                "two",
-                "two",
-                "two",
-                "one",
-            ],
-            dtype=object,
-        )
-        c = np.array(
-            [
-                "dull",
-                "dull",
-                "shiny",
-                "dull",
-                "dull",
-                "shiny",
-                "shiny",
-                "dull",
-                "shiny",
-                "shiny",
-                "shiny",
-            ],
-            dtype=object,
-        )
         vals = np.array([12, 10, 9, 4, 3, 49, 19, 20, 21, 34, 0])
         with pytest.raises(
             NotImplementedError,
@@ -1082,25 +472,10 @@ def test_normalize_margins_and_values_not_supported(
             )
 
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
-    def test_values(self, dropna, aggfunc):
+    def test_values(self, dropna, aggfunc, basic_crosstab_dfs):
         query_count = 1
         join_count = 2 if dropna else 5
-        native_df = native_pd.DataFrame(
-            {
-                "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
-                "favorite_food": [
-                    "chicken",
-                    "fish",
-                    "fish",
-                    "beef",
-                    "chicken",
-                    "beef",
-                    "fish",
-                    "beef",
-                ],
-                "age": [7, 2, 8, 5, 9, 3, 6, 1],
-            }
-        )
+        native_df = basic_crosstab_dfs[0]
 
         with SqlCounter(query_count=query_count, join_count=join_count):
             eval_snowpark_pandas_result(
@@ -1116,26 +491,10 @@ def test_values(self, dropna, aggfunc):
             )
 
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
-    def test_values_series_like(self, dropna, aggfunc):
+    def test_values_series_like(self, dropna, aggfunc, basic_crosstab_dfs):
         query_count = 5
         join_count = 2 if dropna else 5
-        native_df = native_pd.DataFrame(
-            {
-                "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
-                "favorite_food": [
-                    "chicken",
-                    "fish",
-                    "fish",
-                    "beef",
-                    "chicken",
-                    "beef",
-                    "fish",
-                    "beef",
-                ],
-                "age": [7, 2, 8, 5, 9, 3, 6, 1],
-            }
-        )
-        snow_df = pd.DataFrame(native_df)
+        native_df, snow_df = basic_crosstab_dfs
 
         def eval_func(df):
             if isinstance(df, pd.DataFrame):
@@ -1164,23 +523,8 @@ def eval_func(df):
 
 
 @sql_count_checker(query_count=0)
-def test_values_unsupported_aggfunc():
-    native_df = native_pd.DataFrame(
-        {
-            "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
-            "favorite_food": [
-                "chicken",
-                "fish",
-                "fish",
-                "beef",
-                "chicken",
-                "beef",
-                "fish",
-                "beef",
-            ],
-            "age": [7, 2, 8, 5, 9, 3, 6, 1],
-        }
-    )
+def test_values_unsupported_aggfunc(basic_crosstab_dfs):
+    native_df = basic_crosstab_dfs[0]
 
     with pytest.raises(
         NotImplementedError,
@@ -1196,26 +540,10 @@ def test_values_unsupported_aggfunc():
 
 
 @sql_count_checker(query_count=4)
-def test_values_series_like_unsupported_aggfunc():
+def test_values_series_like_unsupported_aggfunc(basic_crosstab_dfs):
     # The query count above comes from building the DataFrame
     # that we pass in to pivot table.
-    native_df = native_pd.DataFrame(
-        {
-            "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
-            "favorite_food": [
-                "chicken",
-                "fish",
-                "fish",
-                "beef",
-                "chicken",
-                "beef",
-                "fish",
-                "beef",
-            ],
-            "age": [7, 2, 8, 5, 9, 3, 6, 1],
-        }
-    )
-    snow_df = pd.DataFrame(native_df)
+    _, snow_df = basic_crosstab_dfs
 
     with pytest.raises(
         NotImplementedError,
diff --git a/tests/integ/modin/crosstab/test_utils.py b/tests/integ/modin/crosstab/test_utils.py
new file mode 100644
index 00000000000..6203419321d
--- /dev/null
+++ b/tests/integ/modin/crosstab/test_utils.py
@@ -0,0 +1,91 @@
+#
+# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
+#
+
+import modin.pandas as pd
+import numpy as np
+import pandas as native_pd
+import pytest
+
+import snowflake.snowpark.modin.plugin  # noqa: F401
+
+
+@pytest.fixture(scope="function")
+def a():
+    return np.array(
+        [
+            "foo",
+            "foo",
+            "foo",
+            "foo",
+            "bar",
+            "bar",
+            "bar",
+            "bar",
+            "foo",
+            "foo",
+            "foo",
+        ],
+        dtype=object,
+    )
+
+
+@pytest.fixture(scope="function")
+def b():
+    return np.array(
+        [
+            "one",
+            "one",
+            "one",
+            "two",
+            "one",
+            "one",
+            "one",
+            "two",
+            "two",
+            "two",
+            "one",
+        ],
+        dtype=object,
+    )
+
+
+@pytest.fixture(scope="function")
+def c():
+    return np.array(
+        [
+            "dull",
+            "dull",
+            "shiny",
+            "dull",
+            "dull",
+            "shiny",
+            "shiny",
+            "dull",
+            "shiny",
+            "shiny",
+            "shiny",
+        ],
+        dtype=object,
+    )
+
+
+@pytest.fixture(scope="function")
+def basic_crosstab_dfs():
+    df = native_pd.DataFrame(
+        {
+            "species": ["dog", "cat", "dog", "dog", "cat", "cat", "dog", "cat"],
+            "favorite_food": [
+                "chicken",
+                "fish",
+                "fish",
+                "beef",
+                "chicken",
+                "beef",
+                "fish",
+                "beef",
+            ],
+            "age": [7, 2, 8, 5, 9, 3, 6, 1],
+        }
+    )
+    return df, pd.DataFrame(df)

From a38bcb1c9650418e2db7679a5ba45c3e206c237a Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Wed, 28 Aug 2024 16:20:00 -0700
Subject: [PATCH 17/20] Address review comments

---
 tests/integ/modin/crosstab/{test_utils.py => conftest.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/integ/modin/crosstab/{test_utils.py => conftest.py} (100%)

diff --git a/tests/integ/modin/crosstab/test_utils.py b/tests/integ/modin/crosstab/conftest.py
similarity index 100%
rename from tests/integ/modin/crosstab/test_utils.py
rename to tests/integ/modin/crosstab/conftest.py

From e835540ccc0ec50ba460d36c09d3c1a2c11f6aa8 Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Wed, 28 Aug 2024 20:36:30 -0700
Subject: [PATCH 18/20] Fix doc

---
 src/snowflake/snowpark/modin/pandas/general.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index e0afb909fa9..237389bbaac 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -2028,7 +2028,7 @@ def crosstab(
 
     Returns
     -------
-    DataFrame
+    Snowpark pandas :class:`~snowflake.snowpark.modin.pandas.DataFrame`
         Cross tabulation of the data.
 
     Notes

From 4e7464cedcb9e4d9ce818295cd827ac983f5b950 Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Fri, 30 Aug 2024 14:10:23 -0700
Subject: [PATCH 19/20] Update coverage

---
 tests/integ/modin/crosstab/test_crosstab.py | 83 ++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index 8c228e895da..61d35575185 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -202,7 +202,7 @@ def eval_func(args_list):
             assert_exception_equal=False,  # Our error message is a little different.
         )
 
-    def test_basic_crosstab_with_df_and_series_objs_pandas_errors(
+    def test_basic_crosstab_with_df_and_series_objs_pandas_errors_columns(
         self, dropna, a, b, c
     ):
         query_count = 4
@@ -248,6 +248,52 @@ def eval_func(args_list):
                 eval_func,
             )
 
+    def test_basic_crosstab_with_df_and_series_objs_pandas_errors_index(
+        self, dropna, a, b, c
+    ):
+        query_count = 4
+        join_count = 1 if dropna else 3
+        a = native_pd.Series(
+            a,
+            dtype=object,
+        )
+        b = native_pd.DataFrame(
+            {
+                "0": b,
+                "1": c,
+            }
+        )
+        # pandas expects only Series objects, or DataFrames that have only a single column, while
+        # we support accepting DataFrames with multiple columns.
+        with pytest.raises(
+            AssertionError, match="arrays and names must have the same length"
+        ):
+            native_pd.crosstab(b, a, rownames=["a", "b"], colnames=["c"], dropna=dropna)
+
+        def eval_func(args_list):
+            a, b = args_list
+            if isinstance(a, native_pd.Series):
+                return native_pd.crosstab(
+                    [b[c] for c in b.columns],
+                    a,
+                    rownames=["a", "b"],
+                    colnames=["c"],
+                    dropna=dropna,
+                )
+            else:
+                return pd.crosstab(
+                    b, a, rownames=["a", "b"], colnames=["c"], dropna=dropna
+                )
+
+        with SqlCounter(query_count=query_count, join_count=join_count):
+            native_args = [a, b]
+            snow_args = [pd.Series(a), pd.DataFrame(b)]
+            eval_snowpark_pandas_result(
+                snow_args,
+                native_args,
+                eval_func,
+            )
+
     def test_margins(self, dropna, a, b, c):
         query_count = 1
         join_count = 1 if dropna else 2
@@ -556,3 +602,38 @@ def test_values_series_like_unsupported_aggfunc(basic_crosstab_dfs):
             aggfunc="median",
             dropna=False,
         )
+
+
+@sql_count_checker(query_count=0)
+def test_values_aggfunc_one_supplied_should_error(a, b, c):
+    eval_snowpark_pandas_result(
+        pd,
+        native_pd,
+        lambda lib: lib.crosstab(index=a, columns=b, aggfunc="sum"),
+        expect_exception=True,
+        expect_exception_match="aggfunc cannot be used without values.",
+        expect_exception_type=ValueError,
+        assert_exception_equal=True,
+    )
+    eval_snowpark_pandas_result(
+        pd,
+        native_pd,
+        lambda lib: lib.crosstab(index=a, columns=b, values=c),
+        expect_exception=True,
+        expect_exception_match="values cannot be used without an aggfunc.",
+        expect_exception_type=ValueError,
+        assert_exception_equal=True,
+    )
+
+
+@sql_count_checker(query_count=0)
+def test_invalid_normalize(a, b):
+    eval_snowpark_pandas_result(
+        pd,
+        native_pd,
+        lambda lib: lib.crosstab(index=a, columns=b, normalize="invalid_value"),
+        expect_exception=True,
+        expect_exception_match="Not a valid normalize argument: invalid_value",
+        expect_exception_type=ValueError,
+        assert_exception_equal=True,
+    )

From 1430d1b50458fca5441d27ba37b664d4a426846e Mon Sep 17 00:00:00 2001
From: Rehan Durrani <rehan.durrani@snowflake.com>
Date: Fri, 30 Aug 2024 14:48:37 -0700
Subject: [PATCH 20/20] Fix tests

---
 src/snowflake/snowpark/modin/pandas/general.py | 2 +-
 tests/integ/modin/crosstab/test_crosstab.py    | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
index fb3822e5d54..df19e9eac91 100644
--- a/src/snowflake/snowpark/modin/pandas/general.py
+++ b/src/snowflake/snowpark/modin/pandas/general.py
@@ -2235,7 +2235,7 @@ def _get_names_wrapper(list_of_objs, names, prefix):
     # as a valid value of normalize is `0` (for normalizing index).
     if normalize is not False:
         if normalize not in [0, 1, "index", "columns", "all", True]:
-            raise ValueError(f"Not a valid normalize argument: {normalize}")
+            raise ValueError("Not a valid normalize argument")
         if normalize is True:
             normalize = "all"
         normalize = {0: "index", 1: "columns"}.get(normalize, normalize)
diff --git a/tests/integ/modin/crosstab/test_crosstab.py b/tests/integ/modin/crosstab/test_crosstab.py
index 61d35575185..276650519d9 100644
--- a/tests/integ/modin/crosstab/test_crosstab.py
+++ b/tests/integ/modin/crosstab/test_crosstab.py
@@ -251,8 +251,8 @@ def eval_func(args_list):
     def test_basic_crosstab_with_df_and_series_objs_pandas_errors_index(
         self, dropna, a, b, c
     ):
-        query_count = 4
-        join_count = 1 if dropna else 3
+        query_count = 6
+        join_count = 5 if dropna else 17
         a = native_pd.Series(
             a,
             dtype=object,
@@ -633,7 +633,7 @@ def test_invalid_normalize(a, b):
         native_pd,
         lambda lib: lib.crosstab(index=a, columns=b, normalize="invalid_value"),
         expect_exception=True,
-        expect_exception_match="Not a valid normalize argument: invalid_value",
+        expect_exception_match="Not a valid normalize argument",
         expect_exception_type=ValueError,
         assert_exception_equal=True,
     )