diff --git a/CHANGELOG.md b/CHANGELOG.md index 23eaea959ff..12548b706ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,20 @@ ## 1.18.0 (TBD) +### Snowpark Python API Updates + +#### New Features + +#### Improvements + +### Snowpark pandas API Updates + +#### New Features + +#### Improvements + +- Added partial support for `DataFrame.pivot_table` with no `index` parameter, as well as for `margins` parameter. + ### Snowpark Local Testing Updates #### Bug Fixes diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst index a613616f23c..bf752172221 100644 --- a/docs/source/modin/supported/dataframe_supported.rst +++ b/docs/source/modin/supported/dataframe_supported.rst @@ -297,9 +297,14 @@ Methods +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``pivot`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ -| ``pivot_table`` | P | ``observed``, ``margins``, | ``N`` if ``index``, ``columns``, or ``values`` is | -| | | ``sort`` | not str; or MultiIndex; or any ``argfunc`` is not | -| | | | "count", "mean", "min", "max", or "sum" | +| ``pivot_table`` | P | ``observed``, ``sort`` | ``N`` if ``index``, ``columns``, or ``values`` is | +| | | | not str, list of str, or None; or MultiIndex; or | +| | | | any ``argfunc`` is not "count", "mean", "min", | +| | | | "max", or "sum". N if ``index`` is None, | +| | | | ``margins`` is True and ``aggfunc`` is "count" | +| | | | or "mean" or a dictionary. N if ``index`` is None | +| | | | and ``aggfunc`` is a dictionary containing | +| | | | lists of aggfuncs to apply. 
| +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ | ``pop`` | N | | | +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+ diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 448132ab9cd..4807073b41d 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -533,12 +533,20 @@ def pivot_table( Notes ----- - Raise NotImplementedError if + - Raise NotImplementedError if - * margins, observed, or sort is given; - * or index, columns, or values is not str; + * observed or sort is given; + * or index, columns, or values is not str, a list of str, or None; * or DataFrame contains MultiIndex; - * or any argfunc is not "count", "mean", "min", "max", or "sum" + * or any aggfunc is not "count", "mean", "min", "max", or "sum" + * index is None, and aggfunc is a dictionary containing lists. + + - Computing margins with no index has limited support: + * when aggfunc is "count" or "mean" the result has discrepancies with pandas - + Snowpark pandas computes the aggfunc over the data grouped by the first pivot + column, while pandas computes the aggfunc over the result of the aggfunc from + the initial pivot. + * aggfunc as a dictionary is not supported. 
See Also -------- diff --git a/src/snowflake/snowpark/modin/plugin/_internal/pivot_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/pivot_utils.py index 3b27cb2609c..f4ba89b160a 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/pivot_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/pivot_utils.py @@ -5,8 +5,10 @@ from collections.abc import Generator, Hashable from functools import reduce from itertools import product -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, NamedTuple, Optional, Union +import numpy as np +import pandas as pd from pandas._typing import AggFuncType, AggFuncTypeBase, Scalar from snowflake.snowpark.column import Column as SnowparkColumn @@ -61,6 +63,94 @@ ) +class PivotedOrderedDataFrameResult(NamedTuple): + # The OrderedDataFrame representation for the join or align result + ordered_dataframe: OrderedDataFrame + # The data column pandas labels of the new frame. + data_column_pandas_labels: list[Hashable] + # The data column snowflake quoted identifiers of the new frame. + data_column_snowflake_quoted_identifiers: list[str] + + +def perform_pivot_and_concatenate( + ordered_dataframe: OrderedDataFrame, + pivot_aggr_groupings: list[PivotAggrGrouping], + groupby_snowflake_quoted_identifiers: list[str], + pivot_snowflake_quoted_identifiers: list[str], + should_join_along_columns: bool, +) -> PivotedOrderedDataFrameResult: + """ + Helper function to perform a full pivot (including joining in the case of multiple aggrs or values) on an OrderedDataFrame. + + Args: + ordered_dataframe: The ordered dataframe to perform pivot on. + pivot_aggr_groupings: A list of PivotAggrGroupings that define the aggregations to apply. + groupby_snowflake_quoted_identifiers: Group by identifiers + pivot_snowflake_quoted_identifiers: Pivot identifiers + should_join_along_columns: Whether to join along columns, or use union to join along rows instead. 
+ """ + last_ordered_dataframe = None + data_column_pandas_labels: list[Hashable] = [] + data_column_snowflake_quoted_identifiers: list[str] = [] + for pivot_aggr_grouping in pivot_aggr_groupings: + existing_snowflake_quoted_identifiers = groupby_snowflake_quoted_identifiers + if last_ordered_dataframe is not None and should_join_along_columns: + # If there are no index columns, then we append the OrderedDataFrame's vertically, rather + # than horizontally, so we do not need to dedupe the columns (and in fact we want the columns + # to have the same name since we want them to match up during the union. + existing_snowflake_quoted_identifiers = ( + last_ordered_dataframe.projected_column_snowflake_quoted_identifiers + ) + + ( + new_pivot_ordered_dataframe, + new_data_column_snowflake_quoted_identifiers, + new_data_column_pandas_labels, + ) = single_pivot_helper( + ordered_dataframe, + existing_snowflake_quoted_identifiers, + groupby_snowflake_quoted_identifiers, + pivot_snowflake_quoted_identifiers, + pivot_aggr_grouping.aggr_label_identifier_pair, + pivot_aggr_grouping.aggfunc, + pivot_aggr_grouping.prefix_label, + ) + + if last_ordered_dataframe: + # If there are index columns, then we join the two OrderedDataFrames + # (horizontally), while if there are no index columns, we concatenate + # them vertically, and have the index be the value column each row + # corresponds to. + # We also join vertically if there are multiple columns and multiple + # pivot values. 
+ if should_join_along_columns: + last_ordered_dataframe = last_ordered_dataframe.join( + right=new_pivot_ordered_dataframe, + left_on_cols=groupby_snowflake_quoted_identifiers, + right_on_cols=groupby_snowflake_quoted_identifiers, + how="left", + ) + data_column_snowflake_quoted_identifiers.extend( + new_data_column_snowflake_quoted_identifiers + ) + data_column_pandas_labels.extend(new_data_column_pandas_labels) + else: + last_ordered_dataframe = last_ordered_dataframe.union_all( + new_pivot_ordered_dataframe + ) + else: + last_ordered_dataframe = new_pivot_ordered_dataframe + data_column_snowflake_quoted_identifiers.extend( + new_data_column_snowflake_quoted_identifiers + ) + data_column_pandas_labels.extend(new_data_column_pandas_labels) + return PivotedOrderedDataFrameResult( + last_ordered_dataframe, + data_column_pandas_labels, + data_column_snowflake_quoted_identifiers, + ) + + def pivot_helper( pivot_frame: InternalFrame, pivot_aggr_groupings: list[PivotAggrGrouping], @@ -69,6 +159,8 @@ def pivot_helper( columns: Any, groupby_snowflake_quoted_identifiers: list[str], pivot_snowflake_quoted_identifiers: list[str], + multiple_aggr_funcs: bool, + multiple_values: bool, index: Optional[list], ) -> InternalFrame: """ @@ -82,6 +174,8 @@ def pivot_helper( columns: The columns argument passed to `pivot_table`. Will become the pandas labels for the data column index. groupby_snowflake_quoted_identifiers: Group by identifiers pivot_snowflake_quoted_identifiers: Pivot identifiers + multiple_aggr_funcs: Whether multiple aggregation functions have been passed in. + multiple_values: Whether multiple values columns have been passed in. index: The index argument passed to `pivot_table` if specified. Will become the pandas labels for the index column. 
Returns: InternalFrame @@ -100,7 +194,6 @@ def pivot_helper( if ordered_dataframe.queries.get("post_actions"): ordered_dataframe = cache_result(ordered_dataframe) - last_ordered_dataframe = None data_column_pandas_labels: list[Hashable] = [] data_column_snowflake_quoted_identifiers: list[str] = [] @@ -157,47 +250,91 @@ def pivot_helper( # # The multi-level pandas prefix label that includes the aggregation value and function labels is also # constructed and passed into the single pivot operation to prepend the remaining of the pandas labels. - for pivot_aggr_grouping in pivot_aggr_groupings: - existing_snowflake_quoted_identifiers = groupby_snowflake_quoted_identifiers - if last_ordered_dataframe is not None: - existing_snowflake_quoted_identifiers = ( - last_ordered_dataframe.projected_column_snowflake_quoted_identifiers + if ( + len(groupby_snowflake_quoted_identifiers) == 0 + and multiple_aggr_funcs + and multiple_values + ): + # When there are multiple aggregation functions, values, and `index=None`, we need + # to handle pivot a little differently. Rather than just joining horizontally or vertically, + # we need to join both horizontally and vertically - each value column gets its own row, so + # for every resulting OrderedDataFrame corresponding to the result of an aggregation on a single + # value, we need to join (concatenate horizontally) to get one row. For every value column, + # we then need to union (concatenate vertically) the resulting rows from the previous step. + # In order to handle this, we first group the aggregations by the column they act on, and run + # one pivot per group of aggregations. We then have multiple one row OrderedDataFrames, where each + # OrderedDataFrame is the result of pivot on a single value column, which we can union in order to + # get our final result. + # Step 1: Determine the values columns. 
+ values_pandas_labels = { + pair.aggr_label_identifier_pair.pandas_label + for pair in pivot_aggr_groupings + } + # Step 2: Group aggregations by the values column they are on. + # Result: {"val_col1": [aggr1, aggr2], "val_col2}": [aggr3, aggr4]} + grouped_pivot_aggr_groupings = { + v: list( + filter( + lambda pair: pair.aggr_label_identifier_pair.pandas_label == v, + pivot_aggr_groupings, + ) ) - + for v in values_pandas_labels + } + # Step 5: Perform pivot for every value column, and union together. + last_ordered_dataframe = None + for value_column in values_pandas_labels: + ( + pivot_ordered_dataframe, + new_data_column_pandas_labels, + new_data_column_snowflake_quoted_identifiers, + ) = perform_pivot_and_concatenate( + ordered_dataframe, + grouped_pivot_aggr_groupings[value_column], + groupby_snowflake_quoted_identifiers, + pivot_snowflake_quoted_identifiers, + True, + ) + if last_ordered_dataframe is None: + last_ordered_dataframe = pivot_ordered_dataframe + data_column_pandas_labels = new_data_column_pandas_labels + data_column_snowflake_quoted_identifiers = ( + new_data_column_snowflake_quoted_identifiers + ) + else: + last_ordered_dataframe = last_ordered_dataframe.union_all( + pivot_ordered_dataframe + ) + assert ( + new_data_column_pandas_labels == data_column_pandas_labels + ), "Labels should match when doing multiple values and multiple aggregation functions and no index." + ordered_dataframe = last_ordered_dataframe + else: + # If there are no index columns (groupby_snowflake_quoted_identifiers) and + # a single aggregation function or a single value, we should join vertically + # instead of horizontally. 
+ should_join_along_columns = len(groupby_snowflake_quoted_identifiers) > 0 or ( + multiple_aggr_funcs and not multiple_values + ) ( - new_pivot_ordered_dataframe, - new_data_column_snowflake_quoted_identifiers, - new_data_column_pandas_labels, - ) = single_pivot_helper( ordered_dataframe, - existing_snowflake_quoted_identifiers, + data_column_pandas_labels, + data_column_snowflake_quoted_identifiers, + ) = perform_pivot_and_concatenate( + ordered_dataframe, + pivot_aggr_groupings, groupby_snowflake_quoted_identifiers, pivot_snowflake_quoted_identifiers, - pivot_aggr_grouping.aggr_label_identifier_pair, - pivot_aggr_grouping.aggfunc, - pivot_aggr_grouping.prefix_label, - ) - - if last_ordered_dataframe: - last_ordered_dataframe = last_ordered_dataframe.join( - right=new_pivot_ordered_dataframe, - left_on_cols=groupby_snowflake_quoted_identifiers, - right_on_cols=groupby_snowflake_quoted_identifiers, - how="left", - ) - else: - last_ordered_dataframe = new_pivot_ordered_dataframe - - data_column_snowflake_quoted_identifiers.extend( - new_data_column_snowflake_quoted_identifiers + should_join_along_columns, ) - data_column_pandas_labels.extend(new_data_column_pandas_labels) - ordered_dataframe = last_ordered_dataframe + # When there are no groupby columns, the index is the first column in the OrderedDataFrame. + # Otherwise, the index is the groupby columns. 
+ length_of_index_columns = max(1, len(groupby_snowflake_quoted_identifiers)) index_column_snowflake_quoted_identifiers = ( ordered_dataframe.projected_column_snowflake_quoted_identifiers[ - 0 : len(groupby_snowflake_quoted_identifiers) + 0:length_of_index_columns ] ) index = index or [None] * len(index_column_snowflake_quoted_identifiers) @@ -299,9 +436,7 @@ def single_pivot_helper( project_snowflake_quoted_identifiers ) - index_snowflake_quoted_identifiers = ( - groupby_snowflake_quoted_identifiers or pivot_snowflake_quoted_identifiers or [] - ) + index_snowflake_quoted_identifiers = groupby_snowflake_quoted_identifiers or [] if not pivot_snowflake_quoted_identifiers or not aggr_snowflake_quoted_identifier: if not groupby_snowflake_quoted_identifiers: @@ -400,6 +535,7 @@ def single_pivot_helper( ), "*", ) + index_snowflake_quoted_identifiers = [pivot_snowflake_quoted_identifiers[0]] # Go through each of the non-group by columns and # 1. Generate corresponding pandas label (without prefix) @@ -686,7 +822,6 @@ def generate_single_pivot_labels( if not pandas_aggfunc_list: continue - # 2. Loop through all aggregation functions for this aggregation value. 
for pandas_single_aggr_func in pandas_aggfunc_list: # pandas only adds aggregation value as label if provided as a list @@ -999,6 +1134,175 @@ def get_margin_aggregation( return aggfunc_expr +def expand_pivot_result_with_pivot_table_margins_no_groupby_columns( + pivot_qc: "SnowflakeQueryCompiler", # type: ignore[name-defined] # noqa: F821 + original_modin_frame: InternalFrame, + pivot_aggr_groupings: list[PivotAggrGrouping], + dropna: bool, + columns: list[str], + aggfunc: AggFuncType, + pivot_snowflake_quoted_identifiers: list[str], + values: list[str], + margins_name: str, +) -> "SnowflakeQueryCompiler": # type: ignore[name-defined] # noqa: F821 + names = pivot_qc.columns.names + margins_frame = pivot_helper( + original_modin_frame, + pivot_aggr_groupings, + not dropna, + not isinstance(aggfunc, list), + columns[:1], + [], # There are no groupby_snowflake_quoted_identifiers + pivot_snowflake_quoted_identifiers[:1], + (isinstance(aggfunc, list) and len(aggfunc) > 1), + (isinstance(values, list) and len(values) > 1), + None, # There is no index. + ) + if len(columns) > 1: + # If there is a multiindex on the pivot result, we need to add the margin_name to the margins frame's data column + # pandas labels, as well as any empty postfixes for the remaining pivot columns if there are more than 2. 
+ new_data_column_pandas_labels = [] + for label in margins_frame.data_column_pandas_labels: + if isinstance(aggfunc, list): + new_label = label + (margins_name,) + else: + new_label = (label, margins_name) + tuple( + "" for _ in range(pivot_qc.columns.nlevels - 2) + ) + new_data_column_pandas_labels.append(new_label) + margins_frame = InternalFrame.create( + ordered_dataframe=margins_frame.ordered_dataframe, + data_column_pandas_labels=new_data_column_pandas_labels, + data_column_pandas_index_names=pivot_qc._modin_frame.data_column_pandas_index_names, + data_column_snowflake_quoted_identifiers=margins_frame.data_column_snowflake_quoted_identifiers, + index_column_pandas_labels=margins_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=margins_frame.index_column_snowflake_quoted_identifiers, + ) + + # Need to create a QueryCompiler for the margins frame, but SnowflakeQueryCompiler is not present in this scope + # so we use this workaround instead. + margins_qc = type(pivot_qc)(margins_frame) + original_pivot_qc_columns = pivot_qc.columns + pivot_qc = pivot_qc.concat(1, [margins_qc]) + # After this step, pivot_qc contains the pivotted columns followed by the margins columns - e.g. say our pivot result is + # B on.e tw"o + # D 28 27 + # E 35 31 + # Then our pivotted query_compiler now looks like this: + # B on.e tw"o margin_for_on.e margin_for_tw"o + # D 28 27 28 27 + # E 35 31 35 31 + # We have to reindex (and rename, since we used pivot, the columns will be named the same) so that we get it in the format: + # B on.e margin_for_on.e tw"o margin_for_tw"o + # D 28 28 27 27 + # E 35 35 31 31 + # If there are more than one pivot columns, then the stride will be greater - e.g. 
if our pivot result looks like this: + # B on.e tw"o + # C dull shi'ny dull shi'ny + # D 5 23 10 17 + # E 8 27 12 19 + # Our pivotted query_compiler will look like this: + # B on.e tw"o on.e tw"o + # C dull shi'ny dull shi'ny All All + # D 5 23 10 17 28 27 + # E 8 27 12 19 35 21 + # And so our re-indexer will look different. + if len(columns) == 1: + # Assuming we have 4 columns after the pivot, we want our reindexer to look like this: [0, 4, 1, 5, 2, 6, 3, 7]. We can accomplish this + # by zipping(range(0, 4), (4, 8)), which gives us [(0, 4), (1, 5), (2, 6), (3, 7)], and then flattening that list using sum(list, tuple()) + # which will result in our flattened indexer [0, 4, 1, 5, 2, 6, 3, 7]. + column_reindexer = list( + sum( + zip( + range(0, len(original_pivot_qc_columns)), + range( + len(original_pivot_qc_columns), + 2 * len(original_pivot_qc_columns), + ), + ), + tuple(), + ) + ) + else: + # When there is more than one pivot column, we need to reindex differently, as the example above shows. Say we have have 2 unique values in + # the first pivot column, and 2 unique values in the second pivot column (as above). Then, our final reindexer should look like this: + # [0, 1, 4, 2, 3, 5]. We can determine how many columns correspond to each first pivot column value by looking at the column MultiIndex for + # the pivotted QC. We can convert that to a frame using the `to_frame` MultiIndex API. Let's take a look at an example. + # Assuming that the MultiIndex (after converting to a frame) looks like this (i.e. there are 2 distinct values for the first pivot column, + # and 3 for the second): + # B C + # 0 on.e dull + # 1 on.e shi'ny + # 2 on.e sy + # 3 tw"o dull + # 4 tw"o shi'ny + mi_as_frame = original_pivot_qc_columns.to_frame(index=False) + # We can then groupby the first pivot column, and call count, which will tell us how many columns correspond to each label from the first pivot column. 
+ # C + # B + # on.e 3 + # tw"o 2 + # If there are multiple columns and multiple aggregation functions, we need to groupby the first two columns instead of just the first one - + # as the first column will be the name of the aggregation function, and the second column will be the values from the first pivot column. + if isinstance(aggfunc, list): + groupby_columns = mi_as_frame.columns[:2].tolist() + value_column_index = 2 + else: + groupby_columns = mi_as_frame.columns[0] + value_column_index = 1 + pivot_multiindex_level_one_lengths = np.cumsum( + mi_as_frame.groupby(groupby_columns, sort=False) + .count()[mi_as_frame.columns[value_column_index]] + .values[:-1] + ) + # We can grab the first column from this groupby (in case there are more than 2 pivot columns), and use these splits with np.split, which will tell us + # the groupings of the columns. E.g., in this case, we would want the following splits for the indexes: [(0, 1, 2), (3, 4)]. Calling np.split with + # the values from above (excluding the last value) will result in that output. We call tuple on the splits to get them in tuple format. + split_original_pivot_qc_indexes = [ + list(group) + for group in np.split( + range(len(original_pivot_qc_columns)), + pivot_multiindex_level_one_lengths, + ) + ] + # Once we have the splits [[0, 1, 2], [3, 4]], we can then insert the indices for the margins columns. + reindexer = [ + group + [margin_index] + for group, margin_index in zip( + split_original_pivot_qc_indexes, + range(len(original_pivot_qc_columns), len(pivot_qc.columns)), + ) + ] + # Now, we have a list that looks like this: [[0, 1, 2, 5], [3, 4, 6]] - we need to make this into a flat list of indexes. + column_reindexer = sum(reindexer, list()) + pivot_qc = pivot_qc.take_2d_positional(slice(None), column_reindexer) + + if len(columns) == 1: + # After reindexing, we have to rename the margins columns to the correct name if we only have one pivot column. 
+ if original_pivot_qc_columns.nlevels == 1: + pivot_qc = pivot_qc.set_columns( + pd.Index( + list( + sum( + zip( + original_pivot_qc_columns, + [margins_name] * len(original_pivot_qc_columns), + ), + tuple(), + ) + ) + ).set_names(names) + ) + else: + # If there are multiple levels in the index even though there is a single pivot column, we need to copy over the prefixes as well. + new_index_names = [] + for label in original_pivot_qc_columns: + new_index_names.extend([label, label[:-1] + (margins_name,)]) + new_index = pd.MultiIndex.from_tuples(new_index_names).set_names(names) + pivot_qc = pivot_qc.set_columns(new_index) + return pivot_qc + + def expand_pivot_result_with_pivot_table_margins( pivot_aggr_groupings: list[PivotAggrGrouping], groupby_snowflake_quoted_identifiers: list[str], diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index a3c9b1c1075..a60b667de67 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -221,6 +221,7 @@ ) from snowflake.snowpark.modin.plugin._internal.pivot_utils import ( expand_pivot_result_with_pivot_table_margins, + expand_pivot_result_with_pivot_table_margins_no_groupby_columns, generate_pivot_aggregation_value_label_snowflake_quoted_identifier_mappings, generate_single_pivot_labels, pivot_helper, @@ -6340,14 +6341,19 @@ def pivot_table( ): raise TypeError("Must provide 'func' or named aggregation **kwargs.") - # With margins, a dictionary aggfunc that maps to list of aggregations is not supported by pandas. We return - # friendly error message in this case. 
- if ( - margins - and isinstance(aggfunc, dict) - and any(not isinstance(af, str) for af in aggfunc.values()) + if isinstance(aggfunc, dict) and any( + not isinstance(af, str) for af in aggfunc.values() ): - raise ValueError("Margins not supported if list of aggregation functions") + # With margins, a dictionary aggfunc that maps to list of aggregations is not supported by pandas. We return + # friendly error message in this case. + if margins: + raise ValueError( + "Margins not supported if list of aggregation functions" + ) + elif index is None: + raise NotImplementedError( + "Not implemented index is None and list of aggregation functions." + ) # Duplicate pivot column and index are not allowed, but duplicate aggregation values are supported. index_and_data_column_pandas_labels = ( @@ -6384,11 +6390,6 @@ def pivot_table( else [] ) - if len(groupby_snowflake_quoted_identifiers) == 0: - raise NotImplementedError( - "pivot_table with no index configuration is currently not supported" - ) - if values is None: # If no values (aggregation columns) are specified, then we use all data columns that are neither # groupby (index) nor pivot columns as the aggregation columns. 
For example, a dataframe with @@ -6409,13 +6410,21 @@ def pivot_table( values, self._modin_frame ) ) - + multiple_agg_funcs_single_values = ( + isinstance(aggfunc, list) and len(aggfunc) > 1 + ) and not isinstance(values, list) + include_aggr_func_in_label = ( + len(groupby_snowflake_quoted_identifiers) != 0 + or multiple_agg_funcs_single_values + ) pivot_aggr_groupings = list( generate_single_pivot_labels( values_label_to_identifier_pairs_list, aggfunc, len(pivot_snowflake_quoted_identifiers) > 0, - isinstance(values, list), + isinstance(values, list) + and (not margins or len(values) > 1) + and include_aggr_func_in_label, sort, ) ) @@ -6429,6 +6438,8 @@ def pivot_table( columns, groupby_snowflake_quoted_identifiers, pivot_snowflake_quoted_identifiers, + (isinstance(aggfunc, list) and len(aggfunc) > 1), + (isinstance(values, list) and len(values) > 1), index, ) @@ -6446,16 +6457,39 @@ def pivot_table( # Add margins if specified, note this will also add the row position since the margin row needs to be fixed # as the last row of the dataframe. If no margins, then we order by the group by columns. - if margins and pivot_aggr_groupings and pivot_snowflake_quoted_identifiers: - pivot_qc = expand_pivot_result_with_pivot_table_margins( - pivot_aggr_groupings, - groupby_snowflake_quoted_identifiers, - pivot_snowflake_quoted_identifiers, - self._modin_frame.ordered_dataframe, - pivot_qc, - margins_name, - fill_value, - ) + # The final condition checks to see if there are any columns in the pivot result. If there are no columns, + # this means that we pivoted on an empty table - in that case, we can skip adding margins, since the result + # will still be an empty DataFrame (but we will have increased the join and union count) for no reason. 
+ if ( + margins + and pivot_aggr_groupings + and pivot_snowflake_quoted_identifiers + and len(pivot_qc.columns) != 0 + ): + if len(groupby_snowflake_quoted_identifiers) > 0: + pivot_qc = expand_pivot_result_with_pivot_table_margins( + pivot_aggr_groupings, + groupby_snowflake_quoted_identifiers, + pivot_snowflake_quoted_identifiers, + self._modin_frame.ordered_dataframe, + pivot_qc, + margins_name, + fill_value, + ) + else: + pivot_qc = ( + expand_pivot_result_with_pivot_table_margins_no_groupby_columns( + pivot_qc, + self._modin_frame, + pivot_aggr_groupings, + dropna, + columns, + aggfunc, + pivot_snowflake_quoted_identifiers, + values, + margins_name, + ) + ) # Rename the data column snowflake quoted identifiers to be closer to pandas labels given we # may have done unwrapping of surrounding quotes, ie. so will unwrap single quotes in snowflake identifiers. @@ -9664,7 +9698,7 @@ def count_freqs( Helper function to compute the mode ("top") and frequency with which the mode appears ("count") for a given column. - This helper returns a 1-row OrderedFrame with the columns "__index__", "top" and "freq", + This helper returns a 1-row OrderedDataFrame with the columns "__index__", "top" and "freq", containing the column name, the mode of this column, and the number of times the mode occurs. This result should be UNION ALL'd together with the results from the other columns of the original frame, then transposed so "top" and "freq" are rows. 
@@ -9687,7 +9721,7 @@ def count_freqs(
         assert len(col_labels_tuple) == len(
             new_index_identifiers
         ), f"level of labels {col_labels_tuple} did not match level of identifiers {new_index_identifiers}"
-        # The below OrderedFrame operations are analogous to the following SQL for column "a":
+        # The below OrderedDataFrame operations are analogous to the following SQL for column "a":
         # SELECT 'a' AS __index__,
         #        a::VARIANT AS top,
         #        IFF(a IS NULL, NULL, COUNT(a)) AS freq
diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
index 8965ae2cc37..1ce52fe6793 100644
--- a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
+++ b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
@@ -1962,12 +1962,20 @@ def pivot_table():
 
         Notes
         -----
-        Raise NotImplementedError if
+        - Raise NotImplementedError if
 
-            * margins, observed, or sort is given;
-            * or index, columns, or values is not str;
+            * observed or sort is given;
+            * or index, columns, or values is not str, a list of str, or None;
             * or DataFrame contains MultiIndex;
-            * or any argfunc is not "count", "mean", "min", "max", or "sum"
+            * or any aggfunc is not "count", "mean", "min", "max", or "sum";
+            * or index is None and aggfunc is a dictionary containing lists.
+
+        - Computing margins with no index has limited support:
+            * when aggfunc is "count" or "mean" the result has discrepancies with pandas -
+              Snowpark pandas computes the aggfunc over the data grouped by the first pivot
+              column, while pandas computes the aggfunc over the result of the aggfunc from
+              the initial pivot.
+            * aggfunc as a dictionary is not supported.
See Also -------- diff --git a/tests/integ/modin/pivot/conftest.py b/tests/integ/modin/pivot/conftest.py index d099d389130..17e350c64a9 100644 --- a/tests/integ/modin/pivot/conftest.py +++ b/tests/integ/modin/pivot/conftest.py @@ -53,6 +53,69 @@ def df_data(): } +@pytest.fixture(scope="module") +def df_data_more_pivot_values(): + return { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "on.e", + "on.e", + "on.e", + 'tw"o', + "on.e", + "on.e", + "on.e", + 'tw"o', + 'tw"o', + 'tw"o', + "on.e", + "thr.ee", + "thr.ee", + "thr.ee", + "on.e", + 'tw"o', + ], + "C": [ + "dull", + "dull", + "shi'ny", + "dull", + "dull", + "shi'ny", + "shi'ny", + "dull", + "shi'ny", + "shi'ny", + "shi'ny", + "dull", + "shi'ny", + "pla.in", + "pla.in", + "pla.in", + ], + "D": np.arange(0, 16), + "E": np.arange(1, 17), + "F": np.arange(2, 18), + } + + @pytest.fixture(scope="module") def df_data_with_duplicates(): return ( diff --git a/tests/integ/modin/pivot/test_pivot_margins.py b/tests/integ/modin/pivot/test_pivot_margins.py index 354f1377c07..81ebaff6839 100644 --- a/tests/integ/modin/pivot/test_pivot_margins.py +++ b/tests/integ/modin/pivot/test_pivot_margins.py @@ -10,18 +10,25 @@ from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +@pytest.mark.parametrize("index", [None, "A"], ids=["no_index", "single_index"]) @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("columns", ["C", ["B", "C"]]) @pytest.mark.parametrize("fill_value", [None, 99.99]) def test_pivot_table_single_with_dropna_options( - df_data_with_nulls, dropna, columns, fill_value + df_data_with_nulls, index, dropna, columns, fill_value ): expected_join_count = 2 if not dropna else 1 + if not dropna and index is None: + expected_join_count += 1 + if len(columns) > 1 and index is None: + pytest.xfail( + reason="SNOW-1435365 - pandas computes values differently than 
us: https://github.com/pandas-dev/pandas/issues/58722." + ) with SqlCounter(query_count=1, join_count=expected_join_count): pivot_table_test_helper( df_data_with_nulls, { - "index": "A", + "index": index, "columns": columns, "values": "D", "dropna": dropna, @@ -31,6 +38,39 @@ def test_pivot_table_single_with_dropna_options( ) +# Not marking as strict since the following test cases pass: +# [None-C-True-no_index] +# [None-columns1-True-no_index] +# [None-C-False-no_index] +# [None-columns1-False-no_index] +@pytest.mark.xfail( + reason="SNOW-1435365 - we do not support margins=True, with no index and aggfunc as a dictionary." +) +@pytest.mark.parametrize("index", [None, "A"], ids=["no_index", "single_index"]) +@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("columns", ["C", ["B", "C"]]) +@pytest.mark.parametrize("fill_value", [None, 99.99]) +def test_pivot_table_single_with_dropna_options_multiple_aggr_funcs( + df_data_with_nulls, index, dropna, columns, fill_value +): + expected_join_count = 2 if not dropna else 1 + if not dropna and index is None: + expected_join_count += 1 + with SqlCounter(query_count=1, join_count=expected_join_count): + pivot_table_test_helper( + df_data_with_nulls, + { + "index": index, + "columns": columns, + "values": ["D", "E"], + "dropna": dropna, + "fill_value": fill_value, + "margins": True, + "aggfunc": {"D": "sum", "E": "max"}, + }, + ) + + @pytest.mark.parametrize( "aggfunc", [ @@ -66,6 +106,20 @@ def test_pivot_table_multiple_columns_values_with_margins( ) +@pytest.mark.parametrize( + "index", + [ + pytest.param( + None, + marks=pytest.mark.xfail( + strict=True, + reason="SNOW-1435365 - pandas computes values differently than us: https://github.com/pandas-dev/pandas/issues/58722.", + ), + ), + ["A", "B"], + ], + ids=["no_index", "multiple_index"], +) @pytest.mark.parametrize( "fill_value", [ @@ -82,12 +136,12 @@ def test_pivot_table_multiple_columns_values_with_margins( ) @sql_count_checker(query_count=1, 
join_count=9, union_count=1) def test_pivot_table_multiple_pivot_values_null_data_with_margins( - df_data_with_nulls, fill_value + df_data_with_nulls, index, fill_value ): pivot_table_test_helper( df_data_with_nulls, { - "index": ["A", "B"], + "index": index, "columns": "C", "values": "F", "aggfunc": ["count", "sum", "mean"], @@ -99,6 +153,9 @@ def test_pivot_table_multiple_pivot_values_null_data_with_margins( ) +@pytest.mark.parametrize( + "index", [None, ["A", "B"]], ids=["no_index", "multiple_index"] +) @pytest.mark.parametrize( "fill_value", [ @@ -113,23 +170,25 @@ def test_pivot_table_multiple_pivot_values_null_data_with_margins( ), ], ) -@sql_count_checker(query_count=1, join_count=6, union_count=1) def test_pivot_table_multiple_pivot_values_null_data_with_margins_nan_blocked( - df_data_with_nulls, fill_value + df_data_with_nulls, index, fill_value ): - pivot_table_test_helper( - df_data_with_nulls, - { - "index": ["A", "B"], - "columns": "C", - "values": "F", - "aggfunc": ["min", "max"], - "dropna": False, - "fill_value": fill_value, - "margins": True, - "margins_name": "TOTAL", - }, - ) + join_count = 7 if index is None and fill_value is None else 6 + union_count = 0 if index is None and fill_value is None else 1 + with SqlCounter(query_count=1, join_count=join_count, union_count=union_count): + pivot_table_test_helper( + df_data_with_nulls, + { + "index": index, + "columns": "C", + "values": "F", + "aggfunc": ["min", "max"], + "dropna": False, + "fill_value": fill_value, + "margins": True, + "margins_name": "TOTAL", + }, + ) @sql_count_checker(query_count=1, join_count=12, union_count=1) @@ -184,3 +243,142 @@ def test_pivot_table_unsupported_dropna_with_expanded_aggregation_margins_unsupp aggfunc={"E": ["min"], "F": "max"}, margins=True, ) + + +@pytest.mark.parametrize( + "columns", [["B"], ["B", "C"]], ids=["single_column", "multiple_columns"] +) +class TestPivotTableMarginsNoIndexFewerPivotValues: + @sql_count_checker(query_count=1, join_count=1) + def 
test_single_value_single_aggfunc(self, columns, df_data): + pivot_table_test_helper( + df_data, + { + "columns": columns, + "values": ["D"], + "aggfunc": "sum", + "dropna": True, + "margins": True, + }, + ) + + @sql_count_checker(query_count=1, join_count=1, union_count=2) + def test_multiple_value_single_aggfunc(self, columns, df_data): + pivot_table_test_helper( + df_data, + { + "columns": columns, + "values": ["D", "E"], + "aggfunc": ["sum"], + "dropna": True, + "margins": True, + }, + ) + + @sql_count_checker(query_count=1, join_count=3) + def test_single_value_multiple_aggfunc(self, columns, df_data): + pivot_table_test_helper( + df_data, + { + "columns": columns, + "values": ["D"], + "aggfunc": ["sum", "min"], + "dropna": True, + "margins": True, + }, + ) + + @sql_count_checker(query_count=1, join_count=5, union_count=2) + def test_multiple_value_multiple_aggfunc(self, columns, df_data): + pivot_table_test_helper( + df_data, + { + "columns": columns, + "values": ["D", "E"], + "aggfunc": ["sum", "min"], + "dropna": True, + "margins": True, + }, + ) + + +@sql_count_checker(query_count=1) +def test_pivot_table_empty_table_with_index_margins(): + # Cannot use pivot_table_test_helper since that checks the inferred types + # on the resulting DataFrames' columns (which are empty), and the inferred type + # on our DataFrame's columns is empty, while pandas has type floating. 
+ import pandas as native_pd + + native_df = native_pd.DataFrame({"A": [], "B": [], "C": [], "D": []}) + snow_df = pd.DataFrame(native_df) + pivot_kwargs = { + "index": ["A", "B"], + "columns": "C", + "values": "D", + "aggfunc": "count", + "margins": True, + } + + snow_result = snow_df.pivot_table(**pivot_kwargs).to_pandas() + native_result = native_df.pivot_table(**pivot_kwargs) + + assert native_result.empty == snow_result.empty and (native_result.empty is True) + assert list(native_result.columns) == list(snow_result.columns) + assert list(native_result.index) == list(snow_result.index) + + +@pytest.mark.parametrize( + "columns", [["B"], ["B", "C"]], ids=["single_column", "multiple_columns"] +) +class TestPivotTableMarginsNoIndexMorePivotValues: + @sql_count_checker(query_count=1, join_count=1) + def test_single_value_single_aggfunc(self, columns, df_data_more_pivot_values): + pivot_table_test_helper( + df_data_more_pivot_values, + { + "columns": columns, + "values": ["D"], + "aggfunc": ["sum"], + "dropna": True, + "margins": True, + }, + ) + + @sql_count_checker(query_count=1, join_count=1, union_count=2) + def test_multiple_value_single_aggfunc(self, columns, df_data_more_pivot_values): + pivot_table_test_helper( + df_data_more_pivot_values, + { + "columns": columns, + "values": ["D", "E"], + "aggfunc": "sum", + "dropna": True, + "margins": True, + }, + ) + + @sql_count_checker(query_count=1, join_count=3) + def test_single_value_multiple_aggfunc(self, columns, df_data_more_pivot_values): + pivot_table_test_helper( + df_data_more_pivot_values, + { + "columns": columns, + "values": ["D"], + "aggfunc": ["sum", "min"], + "dropna": True, + "margins": True, + }, + ) + + @sql_count_checker(query_count=1, join_count=5, union_count=2) + def test_multiple_value_multiple_aggfunc(self, columns, df_data_more_pivot_values): + pivot_table_test_helper( + df_data_more_pivot_values, + { + "columns": columns, + "values": ["D", "E"], + "aggfunc": ["sum", "min"], + "dropna": 
True, + "margins": True, + }, + ) diff --git a/tests/integ/modin/pivot/test_pivot_multiple.py b/tests/integ/modin/pivot/test_pivot_multiple.py index 0b1e295ccb2..c0e98d3900d 100644 --- a/tests/integ/modin/pivot/test_pivot_multiple.py +++ b/tests/integ/modin/pivot/test_pivot_multiple.py @@ -2,11 +2,15 @@ # # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. # +import modin.pandas as pd import numpy as np +import pandas as native_pd import pytest +import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.pivot.pivot_utils import pivot_table_test_helper from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.utils import eval_snowpark_pandas_result @sql_count_checker(query_count=1, join_count=1) @@ -21,20 +25,120 @@ def test_pivot_table_single_index_single_column_multiple_values(df_data): ) -@pytest.mark.parametrize("aggfunc", ["count", "sum", "min", "max", "mean"]) +@sql_count_checker(query_count=1, union_count=1) +def test_pivot_table_no_index_single_column_multiple_values(df_data): + pivot_table_test_helper( + df_data, + { + "columns": "B", + "values": ["D", "E"], + }, + ) + + +@sql_count_checker(query_count=1, union_count=1, join_count=2) +def test_pivot_table_no_index_single_column_multiple_values_multiple_aggr_func(df_data): + pivot_table_test_helper( + df_data, + { + "columns": "B", + "values": ["D", "E"], + "aggfunc": ["mean", "max"], + }, + ) + + +@sql_count_checker(query_count=1, union_count=1) +@pytest.mark.parametrize("columns", ["B", ["B", "C"]]) +def test_pivot_table_no_index_multiple_values_single_aggr_func_dict(df_data, columns): + pivot_table_test_helper( + df_data, + { + "columns": columns, + "values": ["D", "E"], + "aggfunc": {"D": "mean", "E": "max"}, + }, + ) + + +# pandas moves the name of the aggfunc into the data columns as an index column. 
+@pytest.mark.xfail( + strict=True, + reason="SNOW-1435365 - look into no index + aggfunc as dictionary with list.", +) +@sql_count_checker(query_count=1, union_count=1) +@pytest.mark.parametrize("columns", ["B", ["B", "C"]]) +def test_pivot_table_no_index_column_multiple_values_multiple_aggr_func_dict( + df_data, columns +): + pivot_table_test_helper( + df_data, + { + "columns": columns, + "values": ["D", "E"], + "aggfunc": {"D": ["mean", "sum"], "E": "max"}, + }, + ) + + +@sql_count_checker(query_count=1, join_count=1) +def test_pivot_table_no_index_single_column_single_values_multiple_aggr_func(df_data): + pivot_table_test_helper( + df_data, + { + "columns": "B", + "values": "D", + "aggfunc": ["mean", "max"], + }, + ) + + +@pytest.mark.parametrize("aggfunc", ["count", "sum", "min", "max", "mean", ["count"]]) +@pytest.mark.parametrize("values", ["D", ["D"]]) @sql_count_checker(query_count=1) -def test_pivot_table_single_index_multiple_column_single_value(df_data, aggfunc): +def test_pivot_table_single_index_multiple_column_single_value( + df_data, aggfunc, values +): pivot_table_test_helper( df_data, { "index": "A", "columns": ["B", "C"], - "values": "D", + "values": values, + "aggfunc": aggfunc, + }, + ) + + +@pytest.mark.parametrize("aggfunc", ["count", "sum", "min", "max", "mean"]) +@pytest.mark.parametrize("values", ["D", ["D"]]) +@sql_count_checker(query_count=1) +def test_pivot_table_no_index_multiple_column_single_value(df_data, aggfunc, values): + pivot_table_test_helper( + df_data, + { + "columns": ["B", "C"], + "values": values, "aggfunc": aggfunc, }, ) +@pytest.mark.parametrize("values", ["D", ["D"]]) +@sql_count_checker(query_count=1, join_count=1) +def test_pivot_table_no_index_multiple_column_single_value_multiple_aggr_func( + df_data, values +): + pivot_table_test_helper( + df_data, + { + "columns": ["B", "C"], + "values": values, + "aggfunc": ["mean", "max"], + }, + ) + + @pytest.mark.skip( "SNOW-853416: Some lingering encoding issues and also 
unsorted order does not match" ) @@ -82,14 +186,43 @@ def test_pivot_table_single_index_single_column_multiple_encoded_values_with_sor ) +@pytest.mark.parametrize("aggfunc", ["count", ["count"]]) @sql_count_checker(query_count=1, join_count=1) -def test_pivot_table_single_index_multiple_columns_multiple_values(df_data): +def test_pivot_table_single_index_multiple_columns_multiple_values(df_data, aggfunc): pivot_table_test_helper( df_data, { "index": "A", "columns": ["B", "C"], "values": ["D", "E"], + "aggfunc": aggfunc, + }, + ) + + +@pytest.mark.parametrize("aggfunc", ["count", ["count"]]) +@sql_count_checker(query_count=1, union_count=1) +def test_pivot_table_no_index_multiple_columns_multiple_values(df_data, aggfunc): + pivot_table_test_helper( + df_data, + { + "columns": ["B", "C"], + "values": ["D", "E"], + "aggfunc": aggfunc, + }, + ) + + +@sql_count_checker(query_count=1, union_count=1, join_count=2) +def test_pivot_table_no_index_multiple_columns_multiple_values_multiple_aggr_funcs( + df_data, +): + pivot_table_test_helper( + df_data, + { + "columns": ["B", "C"], + "values": ["D", "E"], + "aggfunc": ["mean", "max"], }, ) @@ -119,6 +252,58 @@ def test_pivot_table_single_index_no_column_single_value_multiple_aggr_funcs(df_ ) +@sql_count_checker(query_count=0) +def test_pivot_table_no_index_no_column_single_value(df_data): + pivot_kwargs = { + "values": "D", + "aggfunc": "mean", + } + eval_snowpark_pandas_result( + pd.DataFrame(df_data), + native_pd.DataFrame(df_data), + lambda df: df.pivot_table(**pivot_kwargs), + assert_exception_equal=True, + expect_exception=True, + expect_exception_match="No group keys passed!", + expect_exception_type=ValueError, + ) + + +@sql_count_checker(query_count=0) +def test_pivot_table_no_index_no_column_single_value_multiple_aggr_funcs(df_data): + pivot_kwargs = { + "values": "D", + "aggfunc": ["mean", "max"], + } + eval_snowpark_pandas_result( + pd.DataFrame(df_data), + native_pd.DataFrame(df_data), + lambda df: 
df.pivot_table(**pivot_kwargs), + assert_exception_equal=True, + expect_exception=True, + expect_exception_match="No group keys passed!", + expect_exception_type=ValueError, + ) + + +@sql_count_checker(query_count=0, join_count=0) +def test_pivot_table_no_index_no_column_no_value_multiple_aggr_funcs(df_data): + pivot_kwargs = { + "columns": None, + "values": None, + "aggfunc": ["min", "max"], + } + eval_snowpark_pandas_result( + pd.DataFrame(df_data), + native_pd.DataFrame(df_data), + lambda df: df.pivot_table(**pivot_kwargs), + assert_exception_equal=True, + expect_exception=True, + expect_exception_match="No group keys passed!", + expect_exception_type=ValueError, + ) + + @pytest.mark.skip( "SNOW-854301: Multi-Index replaces None with Nan causing test to fail" ) @@ -147,7 +332,7 @@ def update_columns_inline(df): # TODO (SNOW-854301): Needs support for MultiIndex.levels, fails because result.columns.levels[N] don't equal # We use xfail to run so we can help code coverage -@pytest.mark.xfail +@pytest.mark.xfail(strict=True) @pytest.mark.parametrize("values", [None, []]) @sql_count_checker(query_count=0) def test_pivot_table_no_values_by_default(df_data, values): diff --git a/tests/integ/modin/pivot/test_pivot_negative.py b/tests/integ/modin/pivot/test_pivot_negative.py index 3751ae11522..a009e188f7a 100644 --- a/tests/integ/modin/pivot/test_pivot_negative.py +++ b/tests/integ/modin/pivot/test_pivot_negative.py @@ -77,23 +77,6 @@ def test_pivot_table_invalid_values_columns_not_supported(df_data, pivot_table_k ) -@sql_count_checker(query_count=0) -def test_pivot_table_no_index_no_column_single_value_raises_error(df_data): - pivot_table_test_helper_expects_exception( - df_data, - { - "index": None, - "columns": None, - "values": "D", - }, - # we currently throws NotImplementedError if no "index" configuration is provided. 
- # TODO (SNOW-959913): Enable support for no "index" configuration - expect_exception_type=NotImplementedError, - expect_exception_match="pivot_table with no index configuration is currently not supported", - assert_exception_equal=False, - ) - - @pytest.mark.parametrize( "aggfunc", [ @@ -178,15 +161,3 @@ def dummy_aggr_func(series): match="median", ): snow_df.pivot_table(index="A", columns="C", values="D", aggfunc="median") - - with pytest.raises( - NotImplementedError, - match="pivot_table with no index configuration is currently not supported", - ): - snow_df.pivot_table(index=None, columns="C", values="D") - - with pytest.raises( - NotImplementedError, - match="pivot_table with no index configuration is currently not supported", - ): - snow_df.pivot_table(index=None, columns=None, values="D") diff --git a/tests/integ/modin/pivot/test_pivot_single.py b/tests/integ/modin/pivot/test_pivot_single.py index 1147d55ca7d..4887b2e9a98 100644 --- a/tests/integ/modin/pivot/test_pivot_single.py +++ b/tests/integ/modin/pivot/test_pivot_single.py @@ -17,9 +17,6 @@ from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result -@pytest.mark.skip( - "SNOW-959913: Support no index configuration with columns and margins configuration" -) @sql_count_checker(query_count=1) def test_pivot_table_no_index_single_column_single_value(df_data): pivot_table_test_helper( @@ -73,6 +70,48 @@ def test_pivot_table_multi_index_single_column_single_value(df_data, aggfunc): ) +@pytest.mark.parametrize( + "aggfunc", + [ + "count", + "sum", + "min", + "max", + "mean", + ], +) +@sql_count_checker(query_count=1) +def test_pivot_table_no_index(df_data, aggfunc): + pivot_table_test_helper( + df_data, + {"columns": "C", "values": "D", "aggfunc": aggfunc}, + ) + + +@sql_count_checker(query_count=1) +def test_pivot_table_empty_table_with_index(): + # Cannot use pivot_table_test_helper since that checks the inferred types + # on the resulting DataFrames' columns (which are empty), and 
the inferred type + # on our DataFrame's columns is empty, while pandas has type floating. + import pandas as native_pd + + native_df = native_pd.DataFrame({"A": [], "B": [], "C": [], "D": []}) + snow_df = pd.DataFrame(native_df) + pivot_kwargs = { + "index": ["A", "B"], + "columns": "C", + "values": "D", + "aggfunc": "count", + } + + snow_result = snow_df.pivot_table(**pivot_kwargs).to_pandas() + native_result = native_df.pivot_table(**pivot_kwargs) + + assert native_result.empty == snow_result.empty and (native_result.empty is True) + assert list(native_result.columns) == list(snow_result.columns) + assert list(native_result.index) == list(snow_result.index) + + @sql_count_checker(query_count=1) def test_pivot_table_single_index_no_column_single_value(df_data): pivot_table_test_helper( @@ -106,9 +145,9 @@ def test_pivot_table_no_index_no_column_single_value(df_data): "columns": None, "values": "D", }, - expect_exception_match="pivot_table with no index configuration is currently not supported", - expect_exception_type=NotImplementedError, - assert_exception_equal=False, + expect_exception_match=r"No group keys passed\!", + expect_exception_type=ValueError, + assert_exception_equal=True, ) @@ -201,8 +240,8 @@ def test_pivot_on_inline_data_using_temp_table(): assert row_count == 25 -@pytest.mark.xfail(strict=True, raises=SnowparkSQLException, reason="SNOW-1233895") -def test_pivot_empty_frame_snow_1233895(): +@pytest.mark.xfail(strict=True, raises=SnowparkSQLException, reason="SNOW-1013918") +def test_pivot_empty_frame_snow_1013918(): eval_snowpark_pandas_result( *create_test_dfs(columns=["a", "b", "c"]), lambda df: df.pivot_table(index="a", columns="b")