merge with base

snowflakedb · Sep 17, 2024 · 9f2c707 · 9f2c707
2 parents e5b3f83 + a737f33
commit 9f2c707
Show file tree

Hide file tree

Showing 94 changed files with 4,582 additions and 4,484 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,7 +11,14 @@
 #### New Features
 
 - Added support for `TimedeltaIndex.mean` method.
+- Added support for some cases of aggregating `Timedelta` columns on `axis=0` with `agg` or `aggregate`.
+- Added support for `by`, `left_by`, `right_by`, `left_index`, and `right_index` for `pd.merge_asof`.
 
+#### Bug Fixes
+
+- Fixed a bug where an `Index` object created from a `Series`/`DataFrame` incorrectly updates the `Series`/`DataFrame`'s index name after an inplace update has been applied to the original `Series`/`DataFrame`.
+- Suppressed an unhelpful `SettingWithCopyWarning` that sometimes appeared when printing `Timedelta` columns.
+- Fixed `inplace` argument for `Series` objects derived from other `Series` objects.
 
 ## 1.22.1 (2024-09-11)
 This is a re-release of 1.22.0. Please refer to the 1.22.0 release notes for detailed release content.
@@ -124,6 +131,7 @@ This is a re-release of 1.22.0. Please refer to the 1.22.0 release notes for det
 - Added support for `Series.dt.total_seconds` method.
 - Added support for `DataFrame.apply(axis=0)`.
 - Added support for `Series.dt.tz_convert` and `Series.dt.tz_localize`.
+- Added support for `DatetimeIndex.tz_convert` and `DatetimeIndex.tz_localize`.
 
 #### Improvements
 

diff --git a/docs/source/modin/supported/datetime_index_supported.rst b/docs/source/modin/supported/datetime_index_supported.rst
@@ -82,9 +82,9 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``snap``                    | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``tz_convert``              | N                               |                                  |                                                    |
+| ``tz_convert``              | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``tz_localize``             | N                               |                                  |                                                    |
+| ``tz_localize``             | P                               | ``ambiguous``, ``nonexistent``   |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``round``                   | P                               | ``ambiguous``, ``nonexistent``   |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst
@@ -38,9 +38,7 @@ Data manipulations
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``merge``                   | P                               | ``validate``                     | ``N`` if param ``validate`` is given               |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``merge_asof``              | P                               | ``by``, ``left_by``, ``right_by``| ``N`` if param ``direction`` is ``nearest``.       |
-|                             |                                 | , ``left_index``, ``right_index``|                                                    |
-|                             |                                 | , ``suffixes``, ``tolerance``    |                                                    |
+| ``merge_asof``              | P                               | ``suffixes``, ``tolerance``      | ``N`` if param ``direction`` is ``nearest``        |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``merge_ordered``           | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

diff --git a/src/snowflake/snowpark/_internal/analyzer/analyzer.py b/src/snowflake/snowpark/_internal/analyzer/analyzer.py
@@ -979,10 +979,7 @@ def do_resolve_with_resolved_children(
                 schema_query = schema_query_for_values_statement(logical_plan.output)
 
             if logical_plan.data:
-                if (
-                    len(logical_plan.output) * len(logical_plan.data)
-                    < ARRAY_BIND_THRESHOLD
-                ):
+                if not logical_plan.is_large_local_data:
                     return self.plan_builder.query(
                         values_statement(logical_plan.output, logical_plan.data),
                         logical_plan,

diff --git a/src/snowflake/snowpark/_internal/analyzer/binary_expression.py b/src/snowflake/snowpark/_internal/analyzer/binary_expression.py
@@ -2,11 +2,12 @@
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
 
-from typing import AbstractSet, Optional
+from typing import AbstractSet, List, Optional
 
 from snowflake.snowpark._internal.analyzer.expression import (
     Expression,
     derive_dependent_columns,
+    derive_dependent_columns_with_duplication,
 )
 from snowflake.snowpark._internal.analyzer.query_plan_analysis_utils import (
     PlanNodeCategory,
@@ -29,6 +30,9 @@ def __str__(self):
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(self.left, self.right)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(self.left, self.right)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         return PlanNodeCategory.LOW_IMPACT

diff --git a/src/snowflake/snowpark/_internal/analyzer/expression.py b/src/snowflake/snowpark/_internal/analyzer/expression.py
@@ -35,6 +35,13 @@
 def derive_dependent_columns(
     *expressions: "Optional[Expression]",
 ) -> Optional[AbstractSet[str]]:
+    """
+    Given set of expressions, derive the set of columns that the expressions dependents on.
+
+    Note, the returned dependent columns is a set without duplication. For example, given expression
+    concat(col1, upper(co1), upper(col2)), the result will be {col1, col2} even if col1 has
+    occurred in the given expression twice.
+    """
     result = set()
     for exp in expressions:
         if exp is not None:
@@ -48,6 +55,23 @@ def derive_dependent_columns(
     return result
 
 
+def derive_dependent_columns_with_duplication(
+    *expressions: "Optional[Expression]",
+) -> List[str]:
+    """
+    Given set of expressions, derive the list of columns that the expression dependents on.
+
+    Note, the returned columns will have duplication if the column occurred more than once in
+    the given expression. For example, concat(col1, upper(co1), upper(col2)) will have result
+    [col1, col1, col2], where col1 occurred twice in the result.
+    """
+    result = []
+    for exp in expressions:
+        if exp is not None:
+            result.extend(exp.dependent_column_names_with_duplication())
+    return result
+
+
 class Expression:
     """Consider removing attributes, and adding properties and methods.
     A subclass of Expression may have no child, one child, or multiple children.
@@ -68,6 +92,9 @@ def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         # TODO: consider adding it to __init__ or use cached_property.
         return COLUMN_DEPENDENCY_EMPTY
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return []
+
     @property
     def pretty_name(self) -> str:
         """Returns a user-facing string representation of this expression's name.
@@ -143,6 +170,9 @@ def __init__(self, plan: "SnowflakePlan") -> None:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return COLUMN_DEPENDENCY_DOLLAR
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return list(COLUMN_DEPENDENCY_DOLLAR)
+
     @property
     def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
         return self.plan.cumulative_node_complexity
@@ -156,6 +186,9 @@ def __init__(self, expressions: List[Expression]) -> None:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(*self.expressions)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(*self.expressions)
+
     @property
     def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
         return sum_node_complexities(
@@ -172,6 +205,9 @@ def __init__(self, columns: Expression, values: List[Expression]) -> None:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(self.columns, *self.values)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(self.columns, *self.values)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         return PlanNodeCategory.IN
@@ -212,6 +248,9 @@ def __str__(self):
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return {self.name}
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return [self.name]
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         return PlanNodeCategory.COLUMN
@@ -235,6 +274,13 @@ def dependent_column_names(self) -> Optional[AbstractSet[str]]:
             else COLUMN_DEPENDENCY_ALL
         )
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return (
+            derive_dependent_columns_with_duplication(*self.expressions)
+            if self.expressions
+            else []  # we currently do not handle * dependency
+        )
+
     @property
     def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
         complexity = {} if self.expressions else {PlanNodeCategory.COLUMN: 1}
@@ -278,6 +324,14 @@ def __hash__(self):
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return self._dependent_column_names
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return (
+            []
+            if (self._dependent_column_names == COLUMN_DEPENDENCY_ALL)
+            or (self._dependent_column_names is None)
+            else list(self._dependent_column_names)
+        )
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         return PlanNodeCategory.COLUMN
@@ -371,6 +425,9 @@ def __init__(self, expr: Expression, pattern: Expression) -> None:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(self.expr, self.pattern)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(self.expr, self.pattern)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         # expr LIKE pattern
@@ -400,6 +457,9 @@ def __init__(
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(self.expr, self.pattern)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(self.expr, self.pattern)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         # expr REG_EXP pattern
@@ -423,6 +483,9 @@ def __init__(self, expr: Expression, collation_spec: str) -> None:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(self.expr)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(self.expr)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         # expr COLLATE collate_spec
@@ -444,6 +507,9 @@ def __init__(self, expr: Expression, field: str) -> None:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(self.expr)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(self.expr)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         # the literal corresponds to the contribution from self.field
@@ -466,6 +532,9 @@ def __init__(self, expr: Expression, field: int) -> None:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(self.expr)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(self.expr)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         # the literal corresponds to the contribution from self.field
@@ -510,6 +579,9 @@ def sql(self) -> str:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(*self.children)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(*self.children)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         return PlanNodeCategory.FUNCTION
@@ -525,6 +597,9 @@ def __init__(self, expr: Expression, order_by_cols: List[Expression]) -> None:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(self.expr, *self.order_by_cols)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(self.expr, *self.order_by_cols)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         # expr WITHIN GROUP (ORDER BY cols)
@@ -549,13 +624,21 @@ def __init__(
         self.branches = branches
         self.else_value = else_value
 
-    def dependent_column_names(self) -> Optional[AbstractSet[str]]:
+    @property
+    def _child_expressions(self) -> List[Expression]:
         exps = []
         for exp_tuple in self.branches:
             exps.extend(exp_tuple)
         if self.else_value is not None:
             exps.append(self.else_value)
-        return derive_dependent_columns(*exps)
+
+        return exps
+
+    def dependent_column_names(self) -> Optional[AbstractSet[str]]:
+        return derive_dependent_columns(*self._child_expressions)
+
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(*self._child_expressions)
 
     @property
     def plan_node_category(self) -> PlanNodeCategory:
@@ -602,6 +685,9 @@ def __init__(
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(*self.children)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(*self.children)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         return PlanNodeCategory.FUNCTION
@@ -617,6 +703,9 @@ def __init__(self, col: Expression, delimiter: str, is_distinct: bool) -> None:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(self.col)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(self.col)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         return PlanNodeCategory.FUNCTION
@@ -636,6 +725,9 @@ def __init__(self, exprs: List[Expression]) -> None:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(*self.exprs)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(*self.exprs)
+
     @property
     def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
         return sum_node_complexities(

diff --git a/src/snowflake/snowpark/_internal/analyzer/grouping_set.py b/src/snowflake/snowpark/_internal/analyzer/grouping_set.py
@@ -7,6 +7,7 @@
 from snowflake.snowpark._internal.analyzer.expression import (
     Expression,
     derive_dependent_columns,
+    derive_dependent_columns_with_duplication,
 )
 from snowflake.snowpark._internal.analyzer.query_plan_analysis_utils import (
     PlanNodeCategory,
@@ -23,6 +24,9 @@ def __init__(self, group_by_exprs: List[Expression]) -> None:
     def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         return derive_dependent_columns(*self.group_by_exprs)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        return derive_dependent_columns_with_duplication(*self.group_by_exprs)
+
     @property
     def plan_node_category(self) -> PlanNodeCategory:
         return PlanNodeCategory.LOW_IMPACT
@@ -45,6 +49,10 @@ def dependent_column_names(self) -> Optional[AbstractSet[str]]:
         flattened_args = [exp for sublist in self.args for exp in sublist]
         return derive_dependent_columns(*flattened_args)
 
+    def dependent_column_names_with_duplication(self) -> List[str]:
+        flattened_args = [exp for sublist in self.args for exp in sublist]
+        return derive_dependent_columns_with_duplication(*flattened_args)
+
     @property
     def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
         return sum_node_complexities(

diff --git a/src/snowflake/snowpark/_internal/analyzer/snowflake_plan_node.py b/src/snowflake/snowpark/_internal/analyzer/snowflake_plan_node.py
@@ -144,10 +144,27 @@ def __init__(
         self.data = data
         self.schema_query = schema_query
 
+    @property
+    def is_large_local_data(self) -> bool:
+        from snowflake.snowpark._internal.analyzer.analyzer import ARRAY_BIND_THRESHOLD
+
+        return len(self.data) * len(self.output) >= ARRAY_BIND_THRESHOLD
+
     @property
     def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
+        if self.is_large_local_data:
+            # When the number of literals exceeds the threshold, we generate 3 queries:
+            # 1. create table query
+            # 2. insert into table query
+            # 3. select * from table query
+            # We only consider the complexity from the final select * query since other queries
+            # are built based on it.
+            return {
+                PlanNodeCategory.COLUMN: 1,
+            }
+
+        # If we stay under the threshold, we generate a single query:
         # select $1, ..., $m FROM VALUES (r11, r12, ..., r1m), (rn1, ...., rnm)
-        # TODO: use ARRAY_BIND_THRESHOLD
         return {
             PlanNodeCategory.COLUMN: len(self.output),
             PlanNodeCategory.LITERAL: len(self.data) * len(self.output),