Merge branch 'main' into rdurrani-SNOW-1445866

snowflakedb · Jun 25, 2024 · e250751 · e250751
2 parents f16212c + 983d65c
commit e250751
Show file tree

Hide file tree

Showing 14 changed files with 556 additions and 35 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -68,6 +68,7 @@
 - Added support for `replace` and `frac > 1` in `DataFrame.sample` and `Series.sample`.
 - Added support for `Series.at`, `Series.iat`, `DataFrame.at`, and `DataFrame.iat`.
 - Added support for `Series.dt.isocalendar`.
+- Added support for `Series.case_when` except when condition or replacement is callable.
 - Added documentation pages for `Index` and its APIs.
 - Added support for `DataFrame.assign`.
 
@@ -85,10 +86,10 @@
 - `pd.read_csv` reads using the native pandas CSV parser, then uploads data to snowflake using parquet. This enables most of the parameters supported by `read_csv` including date parsing and numeric conversions. Uploading via parquet is roughly twice as fast as uploading via CSV.
 - Initial work to support an Index directly in Snowpark pandas. Support for Index as a first-class component of Snowpark pandas is coming soon.
 - Added lazy index constructor and support for `len`, `shape`, `size`, `empty`, `to_pandas()` and `names`. 
-- Added support for `Index.copy()`
 - For `df.index`, Snowpark pandas creates a lazy index object. 
 - For `df.columns`, Snowpark pandas supports a non-lazy version of an Index since the data is already stored locally
-- Initial work to support an Index directly in Snowpark pandas. Currently, this class is a simple wrapper for a pandas index. Support for Index as a first-class component of Snowpark pandas is coming soon.
+- Added support for `Index.copy()`
+- Added support for Index APIs: `values`, `item()`, `tolist()`, `to_series()` and `to_frame()`
 - Expand support for DataFrames with no rows in `pd.pivot_table` and `DataFrame.pivot_table`.
 
 ## 1.18.0 (2024-05-28)

diff --git a/docs/source/modin/indexing.rst b/docs/source/modin/indexing.rst
@@ -63,6 +63,7 @@ Index
     Index.is_interval
     Index.is_numeric
     Index.is_object
+    Index.item
     Index.min
     Index.max
     Index.reindex

diff --git a/docs/source/modin/series.rst b/docs/source/modin/series.rst
@@ -151,6 +151,7 @@ Series
 .. autosummary::
     :toctree: pandas_api/
 
+    Series.case_when
     Series.drop
     Series.drop_duplicates
     Series.get

diff --git a/docs/source/modin/supported/index_supported.rst b/docs/source/modin/supported/index_supported.rst
@@ -15,7 +15,7 @@ Attributes
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | Index attribute             | Snowpark implemented? (Y/N/P/D) | Notes for current implementation                   |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``values``                  | P                               |                                                    |
+| ``values``                  | Y                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``is_monotonic_increasing`` | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
@@ -140,17 +140,17 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``astype``                  | P                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``item``                    | N                               |                                  |                                                    |
+| ``item``                    | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``map``                     | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``ravel``                   | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``to_list``                 | P                               |                                  |                                                    |
+| ``to_list``                 | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``to_series``               | N                               |                                  |                                                    |
+| ``to_series``               | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``to_frame``                | N                               |                                  |                                                    |
+| ``to_frame``                | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``view``                    | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

diff --git a/docs/source/modin/supported/series_supported.rst b/docs/source/modin/supported/series_supported.rst
@@ -123,7 +123,7 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``bool``                    | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``case_when``               | N                               |                                  |                                                    |
+| ``case_when``               | P                               |                                  | ``N`` if condition or replacement is a callable.   |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``clip``                    | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py
@@ -1108,7 +1108,6 @@ def factorize(
             use_na_sentinel=use_na_sentinel,
         )
 
-    @series_not_implemented()
     def case_when(self, caselist) -> Series:  # noqa: PR01, RT01, D200
         """
         Replace values where the conditions are True.

diff --git a/src/snowflake/snowpark/modin/plugin/_internal/index.py b/src/snowflake/snowpark/modin/plugin/_internal/index.py
@@ -136,7 +136,7 @@ def set_query_compiler(
                     tupleize_cols=tupleize_cols,
                 )
             )._query_compiler
-        self._query_compiler = qc
+        self._query_compiler = qc.drop(columns=qc.columns)
 
     def set_local_index(
         self,
@@ -208,6 +208,19 @@ def check_lazy(*args: Any, **kwargs: Any) -> Any:
                         Index(returned_value[0], convert_to_lazy=False),
                         returned_value[1],
                     )
+                # For methods that return a series, convert this series to snowpark pandas
+                # an example is to_series
+                elif isinstance(returned_value, native_pd.Series):
+                    from snowflake.snowpark.modin.pandas import Series
+
+                    returned_value = Series(returned_value)
+
+                # for methods that return a dataframe, convert this dataframe to snowpark pandas
+                elif isinstance(returned_value, native_pd.DataFrame):
+                    from snowflake.snowpark.modin.pandas import DataFrame
+
+                    returned_value = DataFrame(returned_value)
+
                 return returned_value
 
         return check_lazy
@@ -257,7 +270,6 @@ def to_pandas(self) -> native_pd.Index:
         return self._index
 
     @property
-    @is_lazy_check
     def values(self) -> ArrayLike:
         """
         Return an array representing the data in the Index.
@@ -281,7 +293,6 @@ def values(self) -> ArrayLike:
         >>> idx.values
         array([1, 2, 3])
         """
-        # TODO: SNOW-1458117 implement values
         return self.to_pandas().values
 
     @property
@@ -1478,8 +1489,8 @@ def value_counts(
             dropna=dropna,
         )
 
-    @index_not_implemented()
-    def item(self) -> None:
+    @is_lazy_check
+    def item(self) -> Hashable:
         """
         Return the first element of the underlying data as a Python scalar.
 
@@ -1493,10 +1504,25 @@ def item(self) -> None:
         ValueError
             If the data is not length = 1.
         """
-        # TODO: SNOW-1458117 implement item
+        # slice the first two elements of the index and materialize them
+        item = self._query_compiler.take_2d_positional(
+            index=slice(2), columns=[]
+        ).index.to_pandas()
 
-    @index_not_implemented()
-    def to_series(self) -> None:
+        # return the element as a scalar if the index is exactly one element large
+        if len(item) == 1:
+            return item[0]
+
+        # otherwise raise the same value error as pandas
+        raise ValueError("can only convert an array of size 1 to a Python scalar")
+
+    @is_lazy_check
+    def to_series(
+        self,
+        index: Index | None = None,
+        name: Hashable | None = None
+        # TODO: SNOW-1481037 : Fix typehints
+    ) -> Any:
         """
         Create a Series with both index and values equal to the index keys.
 
@@ -1520,10 +1546,28 @@ def to_series(self) -> None:
         Index.to_frame : Convert an Index to a DataFrame.
         Series.to_frame : Convert Series to DataFrame.
         """
-        # TODO: SNOW-1458117 implement to_series
+        from snowflake.snowpark.modin.pandas import Series
 
-    @index_not_implemented()
-    def to_frame(self) -> None:
+        # get the index name if the name is not given
+        if name is None:
+            name = self.name
+
+        # convert self to a dataframe and get qc
+        # this will give us a df where the index and data columns both have self
+        new_qc = self.to_frame(name=name)._query_compiler
+
+        # if we are given an index, join this index column into qc
+        if index is not None:
+            new_qc = new_qc.set_index_from_series(Series(index)._query_compiler)
+
+        # create series and set the name
+        ser = Series(query_compiler=new_qc)
+        ser.name = name
+        return ser
+
+    @is_lazy_check
+    # TODO: SNOW-1481037 : Fix typehints
+    def to_frame(self, index: bool = True, name: Hashable | None = None) -> Any:
         """
         Create a DataFrame with a column containing the Index.
 
@@ -1546,7 +1590,34 @@ def to_frame(self) -> None:
         Index.to_series : Convert an Index to a Series.
         Series.to_frame : Convert Series to DataFrame.
         """
-        # TODO: SNOW-1458117 implement to_frame
+        from snowflake.snowpark.modin.pandas import DataFrame
+
+        # Do a reset index to convert the index column to a data column,
+        # the index column becomes the pandas default index of row position
+        # Example:
+        # before
+        # index columns:    data columns (empty):
+        #      100
+        #      200
+        #      300
+        # after
+        # index columns:    data columns (name=column_name):
+        #       0               100
+        #       1               200
+        #       2               300
+        # if index is true, we want self to be in the index and data columns of the df,
+        # so set the index as the data column and set the name of the index
+        if index:
+            new_qc = self._query_compiler.reset_index()
+            new_qc = (
+                new_qc.set_index([new_qc.columns[0]], drop=False)
+                .set_columns([name])
+                .set_index_names([self.name])
+            )
+        else:
+            new_qc = self._query_compiler.reset_index(names=[name])
+
+        return DataFrame(query_compiler=new_qc)
 
     @index_not_implemented()
     def fillna(self) -> None:
@@ -1661,7 +1732,6 @@ def hasnans(self) -> None:
         """
         # TODO: SNOW-1458139 implement hasnans
 
-    @is_lazy_check
     def tolist(self) -> list:
         """
         Return a list of the values.
@@ -1688,7 +1758,6 @@ def tolist(self) -> list:
         >>> idx.to_list()
         [1, 2, 3]
         """
-        # TODO: SNOW-1458117 implement tolist
         return self.to_pandas().tolist()
 
     to_list = tolist

diff --git a/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py
@@ -776,8 +776,8 @@ def _extract_loc_set_col_info(
             label for label in columns if label not in frame_data_columns
         ]
         columns = [label for label in columns if label in frame_data_columns]
-        before = frame_data_columns.value_counts()
-        after = union_data_columns.value_counts()
+        before = frame_data_columns.to_pandas().value_counts()
+        after = union_data_columns.to_pandas().value_counts()
         frame_data_col_labels = frame_data_columns.tolist()
         for label in after.index:
             if label in frame_data_columns: