snowflakedb · sfc-gh-rdurrani · Jun 25, 2024 · Jun 24, 2024 · Jun 24, 2024 · Jun 25, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,7 +12,6 @@
 #### New Features
 
 - Added support for `to_boolean` function.
-- Added documentation pages for `Index` and its APIs.
 
 #### Bug Fixes
 
@@ -69,6 +68,8 @@
 - Added support for `replace` and `frac > 1` in `DataFrame.sample` and `Series.sample`.
 - Added support for `Series.at`, `Series.iat`, `DataFrame.at`, and `DataFrame.iat`.
 - Added support for `Series.dt.isocalendar`.
+- Added documentation pages for `Index` and its APIs.
+- Added support for `DataFrame.assign`.
 
 #### Bug Fixes
 

@@ -53,6 +53,7 @@ DataFrame
 .. autosummary::
     :toctree: pandas_api/
 
+    DataFrame.assign
     DataFrame.head
     DataFrame.loc
     DataFrame.iloc

@@ -95,7 +95,7 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``asof``                    | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``assign``                  | N                               |                                  |                                                    |
+| ``assign``                  | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``astype``                  | P                               |                                  | ``N``: from string to datetime or ``errors ==      |
 |                             |                                 |                                  | "ignore"``                                         |

@@ -666,7 +666,6 @@ def add(
             fill_value=fill_value,
         )
 
-    @dataframe_not_implemented()
     def assign(self, **kwargs):  # noqa: PR01, RT01, D200
         """
         Assign new columns to a ``DataFrame``.

@@ -868,6 +868,81 @@ def apply():
         dtype: float64
         """
 
+    def assign():
+        """
+        Assign new columns to a ``DataFrame``.
+
+        Returns a new object with all original columns in addition to new ones. Existing
+        columns that are re-assigned will be overwritten.
+
+        Parameters
+        ----------
+        **kwargs : dict of {str: callable or Series}
+            The column names are the keywords. If the values are callable, they are computed
+            on the DataFrame and assigned to the new columns. The callable must not change the input
+            DataFrame (though Snowpark pandas doesn't check it). If the values are not callable
+            (e.g. a Series, scalar, or array), they are simply assigned.
+
+        Returns
+        -------
+        DataFrame
+            A new DataFrame with the new columns in addition to all the existing columns.
+
+        Notes
+        -----
+        - Assigning multiple columns within the same assign is possible. Later items in `**kwargs`
+          may refer to newly created or modified columns in `df`; items are computed and assigned into `df` in order.
+
+        - If an array of the wrong length is passed to assign, Snowpark pandas will either truncate the array if it is too long,
+          or broadcast the last element of the array until the array is the correct length if it is too short. This differs from native pandas,
+          which raises a ValueError if the length of the array does not match the length of `df`.
+          This is done to preserve Snowpark pandas' lazy evaluation paradigm.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
+        ...                   index=['Portland', 'Berkeley'])
+        >>> df
+                temp_c
+        Portland    17.0
+        Berkeley    25.0
+
+        >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
+                temp_c  temp_f
+        Portland    17.0    62.6
+        Berkeley    25.0    77.0
+
+        >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
+                temp_c  temp_f
+        Portland    17.0    62.6
+        Berkeley    25.0    77.0
+
+        >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
+        ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
+                temp_c  temp_f  temp_k
+        Portland    17.0    62.6  290.15
+        Berkeley    25.0    77.0  298.15
+
+        >>> df = pd.DataFrame({'col1': [17.0, 25.0, 22.0]})
+        >>> df
+           col1
+        0  17.0
+        1  25.0
+        2  22.0
+
+        >>> df.assign(new_col=[10, 11])
+           col1  new_col
+        0  17.0       10
+        1  25.0       11
+        2  22.0       11
+
+        >>> df.assign(new_col=[10, 11, 12, 13, 14])
+           col1  new_col
+        0  17.0       10
+        1  25.0       11
+        2  22.0       12
+        """
+
     def groupby():
         """
         Group DataFrame using a mapper or by a Series of columns.
@@ -1110,11 +1185,6 @@ def add():
         Get addition of ``DataFrame`` and `other`, element-wise (binary operator `add`).
         """
 
-    def assign():
-        """
-        Assign new columns to a ``DataFrame``.
-        """
-
     def boxplot():
         """
         Make a box plot from ``DataFrame`` columns.

@@ -0,0 +1,240 @@
+#
+# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
+#
+
+import re
+
+import modin.pandas as pd
+import pandas as native_pd
+import pytest
+
+import snowflake.snowpark.modin.plugin  # noqa: F401
+from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker
+from tests.integ.modin.utils import (
+    assert_snowpark_pandas_equals_to_pandas_without_dtypecheck,
+    create_test_dfs,
+    eval_snowpark_pandas_result,
+)
+
+
+@sql_count_checker(query_count=7, join_count=1)
+def test_assign_basic_series():
+    snow_df, native_df = create_test_dfs(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=pd.Index(list("abc"), name="columns"),
+        index=pd.Index([0, 1, 2], name="index"),
+    )
+    native_df.columns.names = ["columns"]
+    native_df.index.names = ["index"]
+
+    def assign_func(df):
+        if isinstance(df, pd.DataFrame):
+            return df.assign(new_col=pd.Series([10, 11, 12]))
+        else:
+            return df.assign(new_col=native_pd.Series([10, 11, 12]))
+
+    eval_snowpark_pandas_result(snow_df, native_df, assign_func)
+
+
+@sql_count_checker(query_count=7, join_count=1)
+@pytest.mark.parametrize(
+    "index", [[2, 1, 0], [4, 5, 6]], ids=["reversed_index", "different_index"]
+)
+def test_assign_basic_series_mismatched_index(index):
+    snow_df, native_df = create_test_dfs(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=pd.Index(list("abc"), name="columns"),
+        index=pd.Index([0, 1, 2], name="index"),
+    )
+    native_df.columns.names = ["columns"]
+    native_df.index.names = ["index"]
+
+    def assign_func(df):
+        if isinstance(df, pd.DataFrame):
+            return df.assign(new_col=pd.Series([10, 11, 12], index=index))
+        else:
+            return df.assign(new_col=native_pd.Series([10, 11, 12], index=index))
+
+    eval_snowpark_pandas_result(snow_df, native_df, assign_func)
+
+
+@pytest.mark.parametrize("new_col_value", [2, [10, 11, 12], "x"])
+def test_assign_basic_non_pandas_object(new_col_value):
+    join_count = 2 if isinstance(new_col_value, list) else 0
+    with SqlCounter(query_count=7, join_count=join_count):
+        snow_df, native_df = create_test_dfs(
+            [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+            columns=pd.Index(list("abc"), name="columns"),
+            index=pd.Index([0, 1, 2], name="index"),
+        )
+        native_df.columns.names = ["columns"]
+        native_df.index.names = ["index"]
+        eval_snowpark_pandas_result(
+            snow_df, native_df, lambda df: df.assign(new_column=new_col_value)
+        )
+
+
+@sql_count_checker(query_count=7, join_count=2)
+def test_assign_invalid_long_column_length_negative():
+    # pandas errors out in this test, since we are attempting to assign a column of length 5 to a DataFrame with length 3.
+    # Snowpark pandas, on the other hand, just truncates the extra trailing elements of the new column so that it is the correct length. If we wanted
+    # to error and match pandas behavior, we'd need to eagerly materialize the DataFrame in order to confirm lengths are correct
+    # and error otherwise.
+    snow_df, native_df = create_test_dfs(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=pd.Index(list("abc"), name="columns"),
+        index=pd.Index([0, 1, 2], name="index"),
+    )
+    native_df.columns.names = ["columns"]
+    native_df.index.names = ["index"]
+    with pytest.raises(
+        ValueError,
+        match=re.escape("Length of values (5) does not match length of index (3)"),
+    ):
+        native_df = native_df.assign(new_column=[10, 11, 12, 13, 14])
+
+    snow_df = snow_df.assign(new_column=[10, 11, 12, 13, 14])
+    native_df = native_df.assign(new_column=[10, 11, 12])
+    assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df)
+
+
+@sql_count_checker(query_count=7, join_count=2)
+def test_assign_invalid_short_column_length_negative():
+    # pandas errors out in this test, since we are attempting to assign a column of length 2 to a DataFrame with length 3.
+    # Snowpark pandas, on the other hand, just broadcasts the last element of the new column to fill the remaining rows. If we wanted
+    # to error and match pandas behavior, we'd need to eagerly materialize the DataFrame in order to confirm lengths are correct
+    # and error otherwise.
+    snow_df, native_df = create_test_dfs(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=pd.Index(list("abc"), name="columns"),
+        index=pd.Index([0, 1, 2], name="index"),
+    )
+    native_df.columns.names = ["columns"]
+    native_df.index.names = ["index"]
+    with pytest.raises(
+        ValueError,
+        match=re.escape("Length of values (2) does not match length of index (3)"),
+    ):
+        native_df = native_df.assign(new_column=[10, 11])
+
+    snow_df = snow_df.assign(new_column=[10, 11])
+    native_df = native_df.assign(new_column=[10, 11, 11])
+    assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df)
+
+
+@sql_count_checker(query_count=7, join_count=1)
+def test_assign_short_series():
+    snow_df, native_df = create_test_dfs(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=pd.Index(list("abc"), name="columns"),
+        index=pd.Index([0, 1, 2], name="index"),
+    )
+    native_df.columns.names = ["columns"]
+    native_df.index.names = ["index"]
+    snow_df = snow_df.assign(new_column=pd.Series([10, 11]))
+    native_df = native_df.assign(new_column=native_pd.Series([10, 11]))
+    assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df)
+
+
+@sql_count_checker(query_count=7, join_count=1)
+@pytest.mark.parametrize(
+    "index", [[1, 0], [4, 5]], ids=["reversed_index", "different_index"]
+)
+def test_assign_short_series_mismatched_index(index):
+    snow_df, native_df = create_test_dfs(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=pd.Index(list("abc"), name="columns"),
+        index=pd.Index([0, 1, 2], name="index"),
+    )
+    native_df.columns.names = ["columns"]
+    native_df.index.names = ["index"]
+    snow_df = snow_df.assign(new_column=pd.Series([10, 11], index=index))
+    native_df = native_df.assign(new_column=native_pd.Series([10, 11], index=index))
+    assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df)
+
+
+@sql_count_checker(query_count=7)
+@pytest.mark.parametrize(
+    "callable_fn",
+    [lambda x: x["a"], lambda x: x["a"] + x["b"]],
+    ids=["identity_fn", "add_two_cols_fn"],
+)
+def test_assign_basic_callable(callable_fn):
+    snow_df, native_df = create_test_dfs(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=pd.Index(list("abc"), name="columns"),
+        index=pd.Index([0, 1, 2], name="index"),
+    )
+    native_df.columns.names = ["columns"]
+    native_df.index.names = ["index"]
+    eval_snowpark_pandas_result(
+        snow_df, native_df, lambda df: df.assign(new_col=callable_fn)
+    )
+
+
+@sql_count_checker(query_count=7)
+def test_assign_chained_callable():
+    snow_df, native_df = create_test_dfs(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=pd.Index(list("abc"), name="columns"),
+        index=pd.Index([0, 1, 2], name="index"),
+    )
+    native_df.columns.names = ["columns"]
+    native_df.index.names = ["index"]
+    eval_snowpark_pandas_result(
+        snow_df,
+        native_df,
+        lambda df: df.assign(
+            new_col=lambda x: x["a"] + x["b"], last_col=lambda x: x["new_col"] ** 2
+        ),
+    )
+
+
+@sql_count_checker(query_count=6)
+def test_assign_chained_callable_wrong_order():
+    snow_df, native_df = create_test_dfs(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=pd.Index(list("abc"), name="columns"),
+        index=pd.Index([0, 1, 2], name="index"),
+    )
+    native_df.columns.names = ["columns"]
+    native_df.index.names = ["index"]
+    eval_snowpark_pandas_result(
+        snow_df,
+        native_df,
+        lambda df: df.assign(
+            last_col=lambda x: x["new_col"] ** 2, new_col=lambda x: x["a"] + x["b"]
+        ),
+        expect_exception=True,
+        assert_exception_equal=True,
+        expect_exception_match="new_col",
+        expect_exception_type=KeyError,
+    )
+
+
+@sql_count_checker(query_count=7)
+def test_assign_self_columns():
+    snow_df, native_df = create_test_dfs(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=pd.Index(list("abc"), name="columns"),
+        index=pd.Index([0, 1, 2], name="index"),
+    )
+    native_df.columns.names = ["columns"]
+    native_df.index.names = ["index"]
+    eval_snowpark_pandas_result(
+        snow_df, native_df, lambda df: df.assign(new_col=df["a"], last_col=df["b"])
+    )
+
+
+@sql_count_checker(query_count=7, join_count=2)
+def test_overwrite_columns_via_assign():
+    snow_df, native_df = create_test_dfs(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+        columns=pd.Index(list("abc"), name="columns"),
+        index=pd.Index([0, 1, 2], name="index"),
+    )
+    native_df.columns.names = ["columns"]
+    native_df.index.names = ["index"]
+    eval_snowpark_pandas_result(
+        snow_df, native_df, lambda df: df.assign(a=df["b"], last_col=[10, 11, 12])
+    )