Merge branch 'main' into jrose_snow_1651234_structured_create_dataframe
sfc-gh-jrose authored Sep 11, 2024
2 parents aff450e + ee8df55 commit bfea23c
Showing 21 changed files with 159 additions and 393 deletions.
10 changes: 8 additions & 2 deletions CHANGELOG.md
@@ -48,8 +48,6 @@
- Fixed a bug in `session.get_session_stage` that referenced a non-existing stage after switching database or schema.
- Fixed a bug where calling `DataFrame.to_snowpark_pandas_dataframe` without explicitly initializing the Snowpark pandas plugin caused an error.
- Fixed a bug where using the `explode` function in dynamic table creation caused a SQL compilation error due to improper boolean type casting on the `outer` parameter.
- Fixed a bug where using `to_pandas_batches` with async jobs caused an error due to improper handling of waiting for asynchronous query completion.
- Fixed a bug in `session.create_dataframe` that caused a SQL error when creating an Iceberg table with structured datatypes.

### Snowpark Local Testing Updates

@@ -133,6 +131,14 @@
- When calling `DataFrame.set_index`, or setting `DataFrame.index` or `Series.index`, with a new index that does not match the current length of the `Series`/`DataFrame` object, a `ValueError` is no longer raised. When the `Series`/`DataFrame` object is longer than the new index, the `Series`/`DataFrame`'s new index is filled with `NaN` values for the "extra" elements. When the `Series`/`DataFrame` object is shorter than the new index, the extra values in the new index are ignored—`Series` and `DataFrame` stay the same length `n`, and use only the first `n` values of the new index.
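The length-mismatch behavior described above can be sketched in plain Python. The `reassign_index` helper below is hypothetical (not part of the Snowpark pandas API); it only illustrates the pad-with-NaN / truncate semantics:

```python
def reassign_index(values, new_index, fill=float("nan")):
    """Sketch of the described semantics: pad a too-short new index with
    NaN labels, and ignore the tail of a too-long one."""
    n = len(values)
    if len(new_index) < n:
        # object is longer than the new index: fill the "extra" labels with NaN
        return list(new_index) + [fill] * (n - len(new_index))
    # object is shorter than (or equal to) the new index: keep only the first n labels
    return list(new_index)[:n]
```

So a 3-element object given a 4-label index keeps only the first 3 labels, and given a 1-label index ends up with NaN in positions 2 and 3.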


## 1.21.1 (2024-09-05)

### Snowpark Python API Updates

#### Bug Fixes

- Fixed a bug where using `to_pandas_batches` with async jobs caused an error due to improper handling of waiting for asynchronous query completion.
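The fix amounts to blocking on the asynchronous query before streaming result batches. A minimal stand-alone sketch of that ordering (the `submit` and `fetch_batches` callables are illustrative stand-ins, not Snowpark APIs):

```python
from concurrent.futures import ThreadPoolExecutor

def batches_after_completion(submit, fetch_batches):
    """Submit the query asynchronously, wait for it to finish,
    and only then iterate over the result batches."""
    with ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(submit)
        result = future.result()  # wait for the async job to complete
    yield from fetch_batches(result)
```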

## 1.21.0 (2024-08-19)

### Snowpark Python API Updates
43 changes: 2 additions & 41 deletions docs/source/modin/indexing.rst
@@ -24,7 +24,6 @@ Index
Index.is_monotonic_decreasing
Index.is_unique
Index.has_duplicates
Index.hasnans
Index.dtype
Index.shape
Index.name
@@ -34,6 +33,8 @@ Index
Index.empty
Index.T
Index.nlevels
Index.array
Index.str

.. rubric:: Snowflake Specific

@@ -52,17 +53,11 @@ Index
Index.argmin
Index.argmax
Index.copy
Index.delete
Index.drop
Index.drop_duplicates
Index.duplicated
Index.equals
Index.identical
Index.insert
Index.is_boolean
Index.is_floating
Index.is_integer
Index.is_interval
Index.is_numeric
Index.is_object
Index.item
@@ -73,8 +68,6 @@
Index.unique
Index.nunique
Index.value_counts
Index.array
Index.str

.. rubric:: Compatibility with MultiIndex

@@ -83,17 +76,6 @@

Index.set_names

.. rubric:: Missing values

.. autosummary::
:toctree: pandas_api/

Index.fillna
Index.dropna
Index.isna
Index.notna


.. rubric:: Conversion

.. autosummary::
@@ -113,27 +95,6 @@

Index.sort_values

.. rubric:: Combining / joining / set operations

.. autosummary::
:toctree: pandas_api/

Index.append
Index.join
Index.intersection
Index.union
Index.difference

.. rubric:: Selecting

.. autosummary::
:toctree: pandas_api/

Index.get_indexer_for
Index.get_level_values
Index.isin
Index.slice_indexer

.. _api.datetimeindex:

DatetimeIndex
4 changes: 2 additions & 2 deletions docs/source/modin/supported/index_supported.rst
@@ -54,9 +54,9 @@ Attributes
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``nlevels`` | P | Only single Index supported. |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``array`` | N | |
| ``array`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``str`` | D | |
| ``str`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+


2 changes: 1 addition & 1 deletion recipe/meta.yaml
@@ -1,5 +1,5 @@
{% set name = "snowflake-snowpark-python" %}
{% set version = "1.21.0" %}
{% set version = "1.21.1" %}

package:
name: {{ name|lower }}
59 changes: 10 additions & 49 deletions src/snowflake/snowpark/modin/pandas/__init__.py
@@ -88,14 +88,13 @@

# TODO: SNOW-851745 make sure add all Snowpark pandas API general functions
from modin.pandas import plotting # type: ignore[import]
from modin.pandas.base import BasePandasDataset
from modin.pandas.series import _SERIES_EXTENSIONS_, Series
from modin.pandas.series import Series

from snowflake.snowpark.modin.pandas.api.extensions import (
register_dataframe_accessor,
register_series_accessor,
)
from snowflake.snowpark.modin.pandas.dataframe import _DATAFRAME_EXTENSIONS_, DataFrame
from snowflake.snowpark.modin.pandas.dataframe import DataFrame
from snowflake.snowpark.modin.pandas.general import (
bdate_range,
concat,
@@ -149,6 +148,7 @@
)
from snowflake.snowpark.modin.plugin._internal.session import SnowpandasSessionHolder
from snowflake.snowpark.modin.plugin._internal.telemetry import (
TELEMETRY_PRIVATE_METHODS,
try_add_telemetry_to_attribute,
)
from snowflake.snowpark.modin.plugin.utils.frontend_constants import _ATTRS_NO_LOOKUP
@@ -166,24 +166,6 @@
read_json,
)

# Record which attributes are defined on an upstream object, and which are defined on a vendored
# object (currently just dataframe.py), and determine when adding telemetry is necessary.
# This must be checked before overrides are applied.
_attrs_defined_on_modin_series = set()
for attr_name, attr_value in Series.__dict__.items():
base_value = BasePandasDataset.__dict__.get(attr_name, None)
if base_value is None or attr_value != base_value:
_attrs_defined_on_modin_series.add(attr_name)

_attrs_defined_on_dataframe = (
set()
) # TODO: SNOW-1063346 revisit when dataframe.py is removed
for attr_name, attr_value in DataFrame.__dict__.items():
base_value = BasePandasDataset.__dict__.get(attr_name, None)
if base_value is None or attr_value != base_value:
_attrs_defined_on_dataframe.add(attr_name)


# base overrides occur before subclass overrides in case subclasses override a base method
import snowflake.snowpark.modin.plugin.extensions.base_extensions # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.base_overrides # isort: skip # noqa: E402,F401
@@ -203,48 +185,27 @@
modin.pandas.base._ATTRS_NO_LOOKUP.update(_ATTRS_NO_LOOKUP)


# For any method defined on Series/DF, add telemetry to it if it meets all of the following conditions:
# 1. The method was defined directly on an upstream class (_attrs_defined_on_modin_base, _attrs_defined_on_modin_series)
# 1a. (DataFrame only): The method is not overridden by DataFrame (not applicable to Series, since we use the upstream version)
# 2. The method is not overridden by an extensions module
# 3. The method name does not start with an _
_base_telemetry_added_attrs = set()
# For any method defined on Series/DF, add telemetry to it if it:
# 1. Is defined directly on an upstream class
# 2. Has a name that does not start with an _, or is in TELEMETRY_PRIVATE_METHODS

_series_ext = _SERIES_EXTENSIONS_.copy()
for attr_name in dir(Series):
# Since Series is defined in upstream Modin, all of its members were either defined upstream
# or overridden by extension.
if attr_name not in _series_ext and not attr_name.startswith("_"):
if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS:
register_series_accessor(attr_name)(
try_add_telemetry_to_attribute(attr_name, getattr(Series, attr_name))
)
if attr_name not in _attrs_defined_on_modin_series:
# attribute was defined on BasePandasDataset and inherited, so don't override it again
# for DataFrame
_base_telemetry_added_attrs.add(attr_name)


# TODO: SNOW-1063346
# Since we still use the vendored version of DataFrame and the overrides for the top-level
# namespace haven't been performed yet, we need to set properties on the vendored version
_dataframe_ext = _DATAFRAME_EXTENSIONS_.copy()
for attr_name in dir(DataFrame):
if (
attr_name not in _attrs_defined_on_dataframe
and attr_name not in _dataframe_ext
and not attr_name.startswith("_")
):
# If this method was inherited from BasePandasDataset and telemetry was already added via
# Series, register the override but don't re-wrap the method in the telemetry annotation.
# If we don't do this check, we will end up double-reporting telemetry on some methods.
original_attr = getattr(DataFrame, attr_name)
new_attr = (
original_attr
if attr_name in _base_telemetry_added_attrs
else try_add_telemetry_to_attribute(attr_name, original_attr)
if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS:
register_dataframe_accessor(attr_name)(
try_add_telemetry_to_attribute(attr_name, getattr(DataFrame, attr_name))
)
register_dataframe_accessor(attr_name)(new_attr)
_base_telemetry_added_attrs.add(attr_name)

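The simplified loops above re-register every qualifying attribute with a telemetry wrapper. The same pattern in stand-alone form, with a hypothetical `with_telemetry` decorator standing in for `try_add_telemetry_to_attribute` and a toy whitelist standing in for `TELEMETRY_PRIVATE_METHODS`:

```python
CALLS = []

def with_telemetry(name, attr):
    """Wrap callables so each invocation is recorded; pass data attributes through."""
    if not callable(attr):
        return attr
    def wrapper(*args, **kwargs):
        CALLS.append(name)
        return attr(*args, **kwargs)
    return wrapper

class Frame:
    def head(self):
        return "head"
    def _internal(self):
        return "internal"

PRIVATE_WHITELIST = {"_internal"}  # analogous to TELEMETRY_PRIVATE_METHODS

# Wrap public attributes, plus whitelisted private ones; skip all other dunders.
for attr_name in dir(Frame):
    if not attr_name.startswith("_") or attr_name in PRIVATE_WHITELIST:
        setattr(Frame, attr_name, with_telemetry(attr_name, getattr(Frame, attr_name)))
```

The filter is the key point: underscore-prefixed methods are excluded by default so dunder machinery is never wrapped, and only an explicit whitelist of private methods opts back in.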

def __getattr__(name: str) -> Any:
5 changes: 2 additions & 3 deletions src/snowflake/snowpark/modin/pandas/dataframe.py
@@ -91,7 +91,6 @@
replace_external_data_keys_with_empty_pandas_series,
replace_external_data_keys_with_query_compiler,
)
from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta
from snowflake.snowpark.modin.plugin._internal.utils import is_repr_truncated
from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike
from snowflake.snowpark.modin.plugin.utils.error_message import (
@@ -138,7 +137,7 @@
],
apilink="pandas.DataFrame",
)
class DataFrame(BasePandasDataset, metaclass=TelemetryMeta):
class DataFrame(BasePandasDataset):
_pandas_class = pandas.DataFrame

def __init__(
@@ -2363,7 +2362,7 @@ def set_index(
# this needs to pull all index which is inefficient
if verify_integrity and not new_query_compiler.index.is_unique:
duplicates = new_query_compiler.index[
new_query_compiler.index.duplicated()
new_query_compiler.index.to_pandas().duplicated()
].unique()
raise ValueError(f"Index has duplicate keys: {duplicates}")

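The `verify_integrity` change above materializes the index with `to_pandas()` before calling `duplicated()`. The duplicate detection itself follows pandas semantics: mark every occurrence after the first, then report the distinct offending labels. A stdlib sketch of that check (hypothetical helpers mirroring `Index.duplicated` and the error path):

```python
def duplicated(labels):
    """Mark each label True if it already appeared earlier (pandas keep='first')."""
    seen = set()
    flags = []
    for label in labels:
        flags.append(label in seen)
        seen.add(label)
    return flags

def duplicate_keys(labels):
    """Distinct labels that occur more than once, in order of first duplication."""
    out = []
    for label, dup in zip(labels, duplicated(labels)):
        if dup and label not in out:
            out.append(label)
    return out
```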
44 changes: 5 additions & 39 deletions src/snowflake/snowpark/modin/plugin/_internal/resample_utils.py
@@ -2,7 +2,6 @@
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#

import math
from typing import Any, Literal, NoReturn, Optional, Union

import pandas as native_pd
@@ -18,7 +17,6 @@
dateadd,
datediff,
lit,
row_number,
to_timestamp_ntz,
)
from snowflake.snowpark.modin.plugin._internal import join_utils
@@ -28,16 +26,8 @@
MatchComparator,
join,
)
from snowflake.snowpark.modin.plugin._internal.ordered_dataframe import (
DataFrameReference,
OrderedDataFrame,
)
from snowflake.snowpark.modin.plugin._internal.utils import (
generate_snowflake_quoted_identifiers_helper,
)
from snowflake.snowpark.modin.plugin.utils.error_message import ErrorMessage
from snowflake.snowpark.types import DateType, TimestampType
from snowflake.snowpark.window import Window

RESAMPLE_INDEX_LABEL = "__resample_index__"

@@ -478,39 +468,15 @@ def get_expected_resample_bins_frame(
2020-01-07
2020-01-09
"""
slice_width, slice_unit = rule_to_snowflake_width_and_slice_unit(rule)

index_column_snowflake_quoted_identifiers = (
generate_snowflake_quoted_identifiers_helper(
pandas_labels=[RESAMPLE_INDEX_LABEL]
)
)

# row_number ensures there are no gaps in the sequence.
all_resample_bins_col = dateadd(
slice_unit,
(row_number().over(Window.order_by(lit(1))) - 1) * slice_width,
to_timestamp_ntz(lit(start_date)),
).as_(index_column_snowflake_quoted_identifiers[0])

rowcount = math.floor(
(native_pd.to_datetime(end_date) - native_pd.to_datetime(start_date))
/ to_offset(rule)
+ 1
)

expected_resample_bins_snowpark_frame = pd.session.generator(
all_resample_bins_col, rowcount=rowcount
)

expected_resample_bins_snowpark_frame = pd.date_range(
start_date, end_date, freq=rule
)._query_compiler._modin_frame
return InternalFrame.create(
ordered_dataframe=OrderedDataFrame(
DataFrameReference(expected_resample_bins_snowpark_frame)
),
ordered_dataframe=expected_resample_bins_snowpark_frame.ordered_dataframe,
data_column_pandas_labels=[],
data_column_snowflake_quoted_identifiers=[],
index_column_pandas_labels=[RESAMPLE_INDEX_LABEL],
index_column_snowflake_quoted_identifiers=index_column_snowflake_quoted_identifiers,
index_column_snowflake_quoted_identifiers=expected_resample_bins_snowpark_frame.index_column_snowflake_quoted_identifiers,
data_column_pandas_index_names=[None],
data_column_types=None,
index_column_types=None,
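Both the removed generator-based code (which computed `rowcount = floor((end - start) / offset + 1)` and stepped with `dateadd`) and the replacement `pd.date_range` call produce the same evenly spaced, inclusive bin edges. A stdlib-only sketch of that arithmetic, with no Snowpark dependencies:

```python
from datetime import datetime, timedelta

def resample_bins(start, end, freq):
    """Evenly spaced bin edges from start to end (inclusive), stepping by freq."""
    n = (end - start) // freq + 1  # floor((end - start) / freq) + 1
    return [start + i * freq for i in range(n)]
```

For a 2-day rule from 2020-01-01 to 2020-01-09 this yields 5 edges ending exactly at 2020-01-09, matching the row count the old generator produced.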
4 changes: 0 additions & 4 deletions src/snowflake/snowpark/modin/plugin/_internal/telemetry.py
@@ -9,7 +9,6 @@
from enum import Enum, unique
from typing import Any, Callable, Optional, TypeVar, Union, cast

import modin.pandas
from typing_extensions import ParamSpec

import snowflake.snowpark.session
@@ -543,8 +542,6 @@ class TelemetryMeta(type):
def __new__(
cls, name: str, bases: tuple, attrs: dict[str, Any]
) -> Union[
"modin.pandas.Series",
"snowflake.snowpark.modin.pandas.dataframe.DataFrame",
"snowflake.snowpark.modin.pandas.groupby.DataFrameGroupBy",
"snowflake.snowpark.modin.pandas.resample.Resampler",
"snowflake.snowpark.modin.pandas.window.Window",
@@ -558,7 +555,6 @@ def __new__(
with ``snowpark_pandas_telemetry_api_usage`` telemetry decorator.
Method arguments returned by _get_kwargs_telemetry are collected otherwise set telemetry_args=list().
TelemetryMeta is only set as the metaclass of:
snowflake.snowpark.modin.pandas.dataframe.DataFrame,
snowflake.snowpark.modin.pandas.groupby.DataFrameGroupBy,
snowflake.snowpark.modin.pandas.resample.Resampler,
snowflake.snowpark.modin.pandas.window.Window,
@@ -6,15 +6,10 @@
File containing BasePandasDataset APIs defined in Snowpark pandas but not the Modin API layer.
"""

from snowflake.snowpark.modin.plugin._internal.telemetry import (
snowpark_pandas_telemetry_method_decorator,
)

from .base_overrides import register_base_override


@register_base_override("__array_function__")
@snowpark_pandas_telemetry_method_decorator
def __array_function__(self, func: callable, types: tuple, args: tuple, kwargs: dict):
"""
Apply the `func` to the `BasePandasDataset`.