Commit
Merge branch 'main' into aalam-SNOW-1418500-add-side-effect-to-resolve_packages
sfc-gh-aalam authored Aug 28, 2024
2 parents 1903091 + e83e700 commit 6f1c1e7
Showing 35 changed files with 1,751 additions and 252 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/precommit.yml
@@ -85,7 +85,7 @@ jobs:
matrix:
os: [macos-latest, windows-latest-64-cores, ubuntu-latest-64-cores]
python-version: ["3.9", "3.10", "3.11"]
-cloud-provider: [aws, azure, gcp]
+cloud-provider: [aws, gcp] # TODO: SNOW-1643374 add azure back
exclude:
# only run macos with aws py3.9 for doctest
- os: macos-latest
@@ -309,7 +309,7 @@ jobs:
matrix:
os: [macos-latest, windows-latest-64-cores, ubuntu-latest-64-cores]
python-version: [ "3.9", "3.10", "3.11" ]
-cloud-provider: [aws, azure, gcp]
+cloud-provider: [aws, gcp] # TODO: SNOW-1643374 add azure back
exclude:
# only run macos with aws py3.9 for doctest
- os: macos-latest
15 changes: 11 additions & 4 deletions CHANGELOG.md
@@ -4,9 +4,14 @@

### Snowpark Python API Updates

+#### New Features
+
+- Added the following new functions in `snowflake.snowpark.functions`:
+  - `array_remove`
+  - `ln`

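As a quick sketch of the two new functions (an editorial example, not part of the diff; assumes an active Snowpark `session`):

    from snowflake.snowpark.functions import array_remove, col, ln

    df = session.create_dataframe([[2.72, [1, 2, 1]]], schema=["x", "arr"])
    # ln() computes the natural logarithm; array_remove() drops all elements equal to 1
    df.select(ln(col("x")), array_remove(col("arr"), 1)).show()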
#### Improvements

-- Added support for function `functions.ln`
- Added support for specifying the following to `DataFrameWriter.save_as_table`:
- `enable_schema_evolution`
- `data_retention_time`
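A sketch of the new writer options listed above (the table name is hypothetical; assumes an active Snowpark `session`):

    df = session.create_dataframe([[1, "a"]], schema=["id", "val"])
    df.write.save_as_table(
        "my_table",
        mode="overwrite",
        enable_schema_evolution=True,  # let later writes add new columns
        data_retention_time=1,         # days of Time Travel retention
    )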
@@ -45,13 +50,15 @@

#### New Features

-- Added limited support for the `Timedelta` type, including
+- Added limited support for the `Timedelta` type, including the following features. Snowpark pandas will raise `NotImplementedError` for unsupported `Timedelta` use cases.
  - supporting tracking the Timedelta type through `copy`, `cache_result`, `shift`, `sort_index`.
  - converting non-timedelta to timedelta via `astype`.
-  - `NotImplementedError` will be raised for the rest of methods that do not support `Timedelta`.
  - support for subtracting two timestamps to get a Timedelta.
  - support indexing with Timedelta data columns.
+  - support for adding or subtracting timestamps and `Timedelta`.
+  - support for binary arithmetic between two `Timedelta` values.
+  - support for lazy `TimedeltaIndex`.
- Added support for index's arithmetic and comparison operators.
- Added support for `Series.dt.round`.
- Added documentation pages for `DatetimeIndex`.
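A minimal Snowpark pandas sketch of the Timedelta features above (assumes a configured Snowflake connection; the plugin import registers the Snowflake backend for modin):

    import modin.pandas as pd
    import snowflake.snowpark.modin.plugin  # noqa: F401

    df = pd.DataFrame({"start": pd.to_datetime(["2024-08-01"]),
                       "end": pd.to_datetime(["2024-08-28"])})
    delta = df["end"] - df["start"]      # subtracting two timestamps yields Timedelta
    print(delta + pd.Timedelta(days=1))  # Timedelta arithmetic on the lazy column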
40 changes: 40 additions & 0 deletions docs/source/modin/indexing.rst
@@ -220,3 +220,43 @@ DatetimeIndex

DatetimeIndex.mean
DatetimeIndex.std

.. _api.timedeltaindex:

TimedeltaIndex
--------------

.. autosummary::
   :toctree: pandas_api/

   TimedeltaIndex

.. rubric:: `TimedeltaIndex` Components

.. autosummary::
   :toctree: pandas_api/

   TimedeltaIndex.days
   TimedeltaIndex.seconds
   TimedeltaIndex.microseconds
   TimedeltaIndex.nanoseconds
   TimedeltaIndex.components
   TimedeltaIndex.inferred_freq

.. rubric:: `TimedeltaIndex` Conversion

.. autosummary::
   :toctree: pandas_api/

   TimedeltaIndex.as_unit
   TimedeltaIndex.to_pytimedelta
   TimedeltaIndex.round
   TimedeltaIndex.floor
   TimedeltaIndex.ceil

.. rubric:: `TimedeltaIndex` Methods

.. autosummary::
   :toctree: pandas_api/

   TimedeltaIndex.mean
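For orientation, the vanilla pandas semantics of the components documented above (the Snowpark support table added below still lists these as unimplemented):

    import pandas as pd

    tdi = pd.TimedeltaIndex(["1 days 02:00:00", "3 days 04:30:00"])
    print(tdi.days)     # Index([1, 3]) -- whole-day component
    print(tdi.seconds)  # Index([7200, 16200]) -- seconds within each day
    print(tdi.mean())   # Timedelta('2 days 03:15:00')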
4 changes: 2 additions & 2 deletions docs/source/modin/supported/dataframe_supported.rst
@@ -98,8 +98,8 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``assign`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``astype`` | P | | ``N``: from string to datetime or ``errors == |
-| | | | "ignore"`` |
+| ``astype`` | P | | ``N`` if from string to datetime/timedelta or |
+| | | | ``errors == "ignore"`` |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``at_time`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
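A sketch of the partial `astype` support noted above (assumes a Snowpark pandas DataFrame backed by a configured Snowflake connection):

    import modin.pandas as pd
    import snowflake.snowpark.modin.plugin  # noqa: F401

    df = pd.DataFrame({"i": [1, 2], "s": ["2024-01-01", "2024-01-02"]})
    df["i"].astype("float64")             # supported cast
    try:
        df["s"].astype("datetime64[ns]")  # string -> datetime is not implemented
    except NotImplementedError as exc:
        print(exc)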
1 change: 1 addition & 0 deletions docs/source/modin/supported/index.rst
@@ -16,6 +16,7 @@ To view the docs for the most recent release, check that you’re viewing the st
dataframe_supported
index_supported
datetime_index_supported
+timedelta_index_supported
window_supported
groupby_supported
resampling_supported
4 changes: 2 additions & 2 deletions docs/source/modin/supported/series_supported.rst
@@ -105,8 +105,8 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``asof`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``astype`` | P | | ``N``: from string to datetime or ``errors == |
-| | | | "ignore"`` |
+| ``astype`` | P | | ``N`` if from string to datetime/timedelta or |
+| | | | ``errors == "ignore"`` |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``at_time`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
48 changes: 48 additions & 0 deletions docs/source/modin/supported/timedelta_index_supported.rst
@@ -0,0 +1,48 @@
``pd.TimedeltaIndex`` supported APIs
====================================

The following table is structured as follows: The first column contains the method name.
The second column is a flag for whether or not there is an implementation in Snowpark for
the method in the left column.

.. note::
   ``Y`` stands for yes, i.e., supports distributed implementation, ``N`` stands for no and API simply errors out,
   ``P`` stands for partial (meaning some parameters may not be supported yet), and ``D`` stands for defaults to single
   node pandas execution via UDF/Sproc.

Attributes

+-----------------------------+---------------------------------+----------------------------------------------------+
| TimedeltaIndex attribute | Snowpark implemented? (Y/N/P/D) | Notes for current implementation |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``days`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``seconds`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``microseconds`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``nanoseconds`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``components`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``inferred_freq`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+


Methods

+-----------------------------+---------------------------------+----------------------------------+-------------------------------------------+
| TimedeltaIndex method | Snowpark implemented? (Y/N/P/D) | Missing parameters | Notes for current implementation |
+-----------------------------+---------------------------------+----------------------------------+-------------------------------------------+
| ``as_unit`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+-------------------------------------------+
| ``to_pytimedelta`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+-------------------------------------------+
| ``round`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+-------------------------------------------+
| ``floor`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+-------------------------------------------+
| ``ceil`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+-------------------------------------------+
| ``mean`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+-------------------------------------------+
1 change: 1 addition & 0 deletions docs/source/snowpark/functions.rst
@@ -43,6 +43,7 @@ Functions
array_min
array_position
array_prepend
+array_remove
array_size
array_slice
array_sort
3 changes: 3 additions & 0 deletions src/snowflake/snowpark/_internal/analyzer/snowflake_plan.py
@@ -1608,6 +1608,9 @@ def with_query_block(

new_query = project_statement([], name)

+# note: we do not propagate the query parameters of the child here;
+# the query parameters will be propagated along with the definition during
+# the query generation stage.
queries = child.queries[:-1] + [Query(sql=new_query)]
# propagate the cte table
referenced_ctes = {name}.union(child.referenced_ctes)
4 changes: 2 additions & 2 deletions src/snowflake/snowpark/_internal/compiler/query_generator.py
@@ -64,7 +64,7 @@ def __init__(
# NOTE: the dict used here is an ordered dict; all WITH query block definitions are recorded in
# the order in which the blocks are visited. The order is important to make sure the dependencies
# between the CTE definitions are satisfied.
-self.resolved_with_query_block: Dict[str, str] = {}
+self.resolved_with_query_block: Dict[str, Query] = {}

def generate_queries(
self, logical_plans: List[LogicalPlan]
@@ -209,7 +209,7 @@ def do_resolve_with_resolved_children(
if logical_plan.name not in self.resolved_with_query_block:
self.resolved_with_query_block[
logical_plan.name
-] = resolved_child.queries[-1].sql
+] = resolved_child.queries[-1]

resolved_plan = self.plan_builder.with_query_block(
logical_plan.name,
25 changes: 23 additions & 2 deletions src/snowflake/snowpark/_internal/compiler/utils.py
@@ -227,6 +227,9 @@ def update_resolvable_node(
# re-calculation of the sql query and snowflake plan
node._sql_query = None
node._snowflake_plan = None
+# make sure we also clean up the cached _projection_in_str, so that
+# the projection expression can be re-analyzed during code generation
+node._projection_in_str = None
node.analyzer = query_generator

# update the pre_actions and post_actions for the select statement
@@ -267,12 +270,26 @@
update_resolvable_node(node.snowflake_plan, query_generator)
node.analyzer = query_generator

node.pre_actions = node._snowflake_plan.queries[:-1]
node.post_actions = node._snowflake_plan.post_actions
node._api_calls = node._snowflake_plan.api_calls

if isinstance(node, SelectSnowflakePlan):
node.expr_to_alias.update(node._snowflake_plan.expr_to_alias)
node.df_aliased_col_name_to_real_col_name.update(
node._snowflake_plan.df_aliased_col_name_to_real_col_name
)
node._query_params = []
for query in node._snowflake_plan.queries:
if query.params:
node._query_params.extend(query.params)

elif isinstance(node, Selectable):
node.analyzer = query_generator


def get_snowflake_plan_queries(
-    plan: SnowflakePlan, resolved_with_query_blocks: Dict[str, str]
+    plan: SnowflakePlan, resolved_with_query_blocks: Dict[str, Query]
) -> Dict[PlanQueryType, List[Query]]:

from snowflake.snowpark._internal.analyzer.analyzer_utils import cte_statement
@@ -286,12 +303,16 @@ def get_snowflake_plan_queries(
post_action_queries = copy.deepcopy(plan.post_actions)
table_names = []
definition_queries = []
+final_query_params = []
for name, definition_query in resolved_with_query_blocks.items():
if name in plan.referenced_ctes:
table_names.append(name)
-definition_queries.append(definition_query)
+definition_queries.append(definition_query.sql)
+final_query_params.extend(definition_query.params)
with_query = cte_statement(definition_queries, table_names)
plan_queries[-1].sql = with_query + plan_queries[-1].sql
+final_query_params.extend(plan_queries[-1].params)
+plan_queries[-1].params = final_query_params

return {
PlanQueryType.QUERIES: plan_queries,
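Taken together with the with_query_block and query_generator.py changes above, a CTE definition now travels as a full Query (SQL text plus bind parameters) until query generation. A self-contained mock of the idea (simplified, hypothetical names -- not the real Snowpark internals):

    from dataclasses import dataclass, field
    from typing import Any, Dict, List, Set

    @dataclass
    class MockQuery:  # stand-in for Snowpark's internal Query
        sql: str
        params: List[Any] = field(default_factory=list)

    def attach_ctes(final: MockQuery, resolved: Dict[str, MockQuery],
                    referenced: Set[str]) -> MockQuery:
        names, bodies, params = [], [], []
        for name, definition in resolved.items():  # insertion order preserves CTE dependencies
            if name in referenced:
                names.append(name)
                bodies.append(definition.sql)
                params.extend(definition.params)   # definition params come first
        with_clause = "WITH " + ", ".join(f"{n} AS ({b})" for n, b in zip(names, bodies))
        return MockQuery(f"{with_clause} {final.sql}", params + final.params)

    cte = {"CTE_0": MockQuery("SELECT * FROM t WHERE x = ?", [42])}
    print(attach_ctes(MockQuery("SELECT * FROM CTE_0 WHERE y = ?", [7]), cte, {"CTE_0"}))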
50 changes: 50 additions & 0 deletions src/snowflake/snowpark/functions.py
@@ -5333,6 +5333,56 @@ def array_append(array: ColumnOrName, element: ColumnOrName) -> Column:
return builtin("array_append")(a, e)


def array_remove(array: ColumnOrName, element: ColumnOrLiteral) -> Column:
"""Given a source ARRAY, returns an ARRAY with elements of the specified value removed.

Args:
    array: name of column containing array.
    element: element to be removed from the array. If the element is a VARCHAR, it needs
        to be cast to the VARIANT data type.

Examples::
>>> from snowflake.snowpark.types import VariantType
>>> df = session.create_dataframe([([1, '2', 3.1, 1, 1],)], ['data'])
>>> df.select(array_remove(df.data, 1).alias("objects")).show()
-------------
|"OBJECTS" |
-------------
|[ |
| "2", |
| 3.1 |
|] |
-------------
<BLANKLINE>
>>> df.select(array_remove(df.data, lit('2').cast(VariantType())).alias("objects")).show()
-------------
|"OBJECTS" |
-------------
|[ |
| 1, |
| 3.1, |
| 1, |
| 1 |
|] |
-------------
<BLANKLINE>
>>> df.select(array_remove(df.data, None).alias("objects")).show()
-------------
|"OBJECTS" |
-------------
|NULL |
-------------
<BLANKLINE>

See Also:
    - `ARRAY <https://docs.snowflake.com/en/sql-reference/data-types-semistructured#label-data-type-array>`_ for more details on semi-structured arrays.
"""
a = _to_col_if_str(array, "array_remove")
return builtin("array_remove")(a, element)


def array_cat(array1: ColumnOrName, array2: ColumnOrName) -> Column:
"""Returns the concatenation of two ARRAYs.
2 changes: 1 addition & 1 deletion src/snowflake/snowpark/modin/pandas/__init__.py
@@ -63,7 +63,6 @@
SparseDtype,
StringDtype,
Timedelta,
-TimedeltaIndex,
Timestamp,
UInt8Dtype,
UInt16Dtype,
@@ -156,6 +155,7 @@
from snowflake.snowpark.modin.plugin.extensions.pd_overrides import ( # isort: skip # noqa: E402,F401
Index,
DatetimeIndex,
+TimedeltaIndex,
)
import snowflake.snowpark.modin.plugin.extensions.base_overrides # isort: skip # noqa: E402,F401
import snowflake.snowpark.modin.plugin.extensions.dataframe_extensions # isort: skip # noqa: E402,F401
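Net effect of the import change above (a sketch; assumes the Snowpark pandas plugin is installed):

    import modin.pandas as pd
    import snowflake.snowpark.modin.plugin  # noqa: F401

    # pd.TimedeltaIndex now resolves to the Snowpark pandas override from
    # pd_overrides, not the class re-exported from native pandas.
    print(pd.TimedeltaIndex)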