Commit 001fb1d

Merge branch 'main' into jrose_snow_1058583_iceberg_save

sfc-gh-jrose authored Sep 3, 2024
2 parents ffb9876 + 3c1db07
Showing 61 changed files with 9,988 additions and 4,874 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -76,6 +76,7 @@
- Added support for lazy `TimedeltaIndex`.
- Added support for `pd.to_timedelta`.
- Added support for `GroupBy` aggregations `min`, `max`, `mean`, `idxmax`, `idxmin`, `std`, `sum`, `median`, `count`, `any`, `all`, `size`, `nunique`.
- Added support for `TimedeltaIndex` attributes: `days`, `seconds`, `microseconds` and `nanoseconds`.
- Added support for index's arithmetic and comparison operators.
- Added support for `Series.dt.round`.
- Added documentation pages for `DatetimeIndex`.
@@ -89,16 +90,29 @@
- Added support for `Index.is_boolean`, `Index.is_integer`, `Index.is_floating`, `Index.is_numeric`, and `Index.is_object`.
- Added support for `DatetimeIndex.round`, `DatetimeIndex.floor` and `DatetimeIndex.ceil`.
- Added support for `Series.dt.days_in_month` and `Series.dt.daysinmonth`.
- Added support for `DataFrameGroupBy.value_counts` and `SeriesGroupBy.value_counts`.
- Added support for `Series.is_monotonic_increasing` and `Series.is_monotonic_decreasing`.
- Added support for `Index.is_monotonic_increasing` and `Index.is_monotonic_decreasing`.
- Added support for `pd.crosstab`.

#### Improvements

- Refactored `quoted_identifier_to_snowflake_type` to avoid making metadata queries if the types have been cached locally.
- Improved `pd.to_datetime` to handle all local input cases.

#### Bug Fixes

- Stopped ignoring nanoseconds in `pd.Timedelta` scalars.
- Fixed an `AssertionError` raised in trees of binary operations.

#### Behavior Change

- When calling `DataFrame.set_index`, or setting `DataFrame.index` or `Series.index`, with a new index whose length does not match the length of the `Series`/`DataFrame`, a `ValueError` is no longer raised. When the `Series`/`DataFrame` is longer than the new index, its index is padded with `NaN` values for the "extra" elements. When the `Series`/`DataFrame` is shorter than the new index, the extra values in the new index are ignored: the `Series`/`DataFrame` keeps its length `n` and uses only the first `n` values of the new index.
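
A minimal sketch of the new behavior, assuming Snowpark pandas imported through modin and an active Snowflake session (values are illustrative):

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # registers the Snowpark pandas backend

s = pd.Series([10, 20, 30])  # length 3

# Object longer than the new index: previously a ValueError, now the
# trailing element gets a NaN index label.
s.index = pd.Index(["a", "b"])  # index becomes ["a", "b", NaN]

# Object shorter than the new index: extra labels are ignored; the
# Series keeps length 3 and uses only the first 3 labels.
s2 = pd.Series([10, 20, 30])
s2.index = pd.Index(["a", "b", "c", "d"])  # index becomes ["a", "b", "c"]
```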

#### Improvements

- Improved `concat` and `join` performance when operations are performed on Series coming from the same DataFrame by avoiding unnecessary joins.
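
An illustrative case this optimization targets, assuming Snowpark pandas: both operands derive from the same DataFrame, so they can be aligned without a join:

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # registers the Snowpark pandas backend

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# Both Series come from df, so concat and binary ops can reuse df's
# underlying query plan instead of joining two separate plans.
side_by_side = pd.concat([df["a"], df["b"]], axis=1)
total = df["a"] + df["b"]
```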

## 1.21.0 (2024-08-19)

### Snowpark Python API Updates
1 change: 1 addition & 0 deletions docs/source/modin/general_functions.rst
@@ -11,6 +11,7 @@ General functions
:toctree: pandas_api/

melt
crosstab
pivot
pivot_table
cut
2 changes: 2 additions & 0 deletions docs/source/modin/groupby.rst
@@ -59,6 +59,7 @@ GroupBy
DataFrameGroupBy.std
DataFrameGroupBy.sum
DataFrameGroupBy.tail
DataFrameGroupBy.value_counts
DataFrameGroupBy.var

.. rubric:: `SeriesGroupBy` computations / descriptive stats
@@ -90,4 +91,5 @@ GroupBy
SeriesGroupBy.std
SeriesGroupBy.sum
SeriesGroupBy.tail
SeriesGroupBy.value_counts
SeriesGroupBy.var
2 changes: 2 additions & 0 deletions docs/source/modin/series.rst
@@ -26,6 +26,8 @@ Series
Series.equals
Series.empty
Series.hasnans
Series.is_monotonic_increasing
Series.is_monotonic_decreasing
Series.name
Series.ndim
Series.shape
5 changes: 4 additions & 1 deletion docs/source/modin/supported/general_supported.rst
@@ -18,7 +18,10 @@ Data manipulations
| ``concat`` | P | ``levels`` is not supported, | |
| | | ``copy`` is ignored | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``crosstab`` | N | | |
| ``crosstab``                | P                               |                                  | ``N`` if ``aggfunc`` is not one of                 |
|                             |                                 |                                  | "count", "mean", "min", "max", or "sum"; or if     |
|                             |                                 |                                  | ``margins`` is True, ``normalize`` is "all" or     |
|                             |                                 |                                  | True, and ``values`` is passed.                    |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``cut``                     | P                               | ``retbins``, ``labels``          | ``N`` if ``retbins=True`` or ``labels!=False``     |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
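A minimal sketch of a `pd.crosstab` call in the supported region of the table above, assuming Snowpark pandas and made-up data:

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # registers the Snowpark pandas backend

df = pd.DataFrame({
    "region": ["east", "east", "west", "west"],
    "product": ["a", "b", "a", "a"],
    "sales": [10, 20, 30, 40],
})

# Supported: aggfunc is one of "count", "mean", "min", "max", "sum".
pd.crosstab(df["region"], df["product"], values=df["sales"], aggfunc="sum")

# Per the note above, combining margins=True, normalize="all"/True, and
# values is not supported, nor is any other aggfunc.
```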
2 changes: 1 addition & 1 deletion docs/source/modin/supported/groupby_supported.rst
@@ -166,7 +166,7 @@ Computations/descriptive stats
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``take`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``value_counts`` | N | |
| ``value_counts`` | P | ``N`` if ``bins`` is given for SeriesGroupBy |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``var`` | P | See ``std`` |
+-----------------------------+---------------------------------+----------------------------------------------------+
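A short sketch of the newly supported groupby `value_counts`, assuming Snowpark pandas:

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # registers the Snowpark pandas backend

df = pd.DataFrame({"team": ["x", "x", "y", "y"],
                   "status": ["ok", "ok", "ok", "err"]})

# DataFrameGroupBy.value_counts: counts of value combinations per group.
df.groupby("team").value_counts()

# SeriesGroupBy.value_counts works too, but per the table above it is
# "N" when bins= is passed.
df.groupby("team")["status"].value_counts()
```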
4 changes: 2 additions & 2 deletions docs/source/modin/supported/index_supported.rst
@@ -20,9 +20,9 @@ Attributes
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``values`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``is_monotonic_increasing`` | N | |
| ``is_monotonic_increasing`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``is_monotonic_decreasing`` | N | |
| ``is_monotonic_decreasing`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``is_unique`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
4 changes: 2 additions & 2 deletions docs/source/modin/supported/series_supported.rst
@@ -43,9 +43,9 @@ Attributes
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``index`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``is_monotonic_decreasing`` | N | |
| ``is_monotonic_decreasing`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``is_monotonic_increasing`` | N | |
| ``is_monotonic_increasing`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``is_unique`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
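A brief sketch of the monotonicity attributes now marked supported for both `Series` and `Index`, with illustrative values:

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # registers the Snowpark pandas backend

s = pd.Series([1, 2, 2, 3])
s.is_monotonic_increasing  # True: non-strictly increasing
s.is_monotonic_decreasing  # False

idx = pd.Index([3, 2, 1])
idx.is_monotonic_decreasing  # True
```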
8 changes: 4 additions & 4 deletions docs/source/modin/supported/timedelta_index_supported.rst
@@ -15,13 +15,13 @@ Attributes
+-----------------------------+---------------------------------+----------------------------------------------------+
| TimedeltaIndex attribute | Snowpark implemented? (Y/N/P/D) | Notes for current implementation |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``days`` | N | |
| ``days`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``seconds`` | N | |
| ``seconds`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``microseconds`` | N | |
| ``microseconds`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``nanoseconds`` | N | |
| ``nanoseconds`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``components`` | N | |
+-----------------------------+---------------------------------+----------------------------------------------------+
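A minimal sketch of the component attributes now supported on the lazy `TimedeltaIndex`, assuming it can be built from strings as in native pandas:

```python
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # registers the Snowpark pandas backend

tdi = pd.TimedeltaIndex(["1 days 02:03:04.000005006"])

tdi.days          # [1]
tdi.seconds       # seconds within the day: 2*3600 + 3*60 + 4 = [7384]
tdi.microseconds  # [5]
tdi.nanoseconds   # [6]
```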
6 changes: 5 additions & 1 deletion src/snowflake/snowpark/_internal/analyzer/analyzer_utils.py
@@ -878,6 +878,9 @@ def create_table_as_select_statement(
change_tracking: Optional[bool] = None,
copy_grants: bool = False,
iceberg_config: Optional[dict] = None,
*,
use_scoped_temp_objects: bool = False,
is_generated: bool = False,
) -> str:
column_definition_sql = (
f"{LEFT_PARENTHESIS}{column_definition}{RIGHT_PARENTHESIS}"
@@ -911,7 +914,8 @@
)
options_statement = get_options_statement(options)
return (
f"{CREATE}{OR + REPLACE if replace else EMPTY_STRING} {table_type.upper()} "
f"{CREATE}{OR + REPLACE if replace else EMPTY_STRING}"
f" {(get_temp_type_for_object(use_scoped_temp_objects, is_generated) if table_type.lower() in TEMPORARY_STRING_SET else table_type).upper()} "
f"{ICEBERG if iceberg_config is not None else EMPTY_STRING}{TABLE}"
f"{IF + NOT + EXISTS if not replace and not error else EMPTY_STRING} "
f"{table_name}{column_definition_sql}{cluster_by_clause}{options_statement}"
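A condensed sketch of the type-selection logic the new keyword-only flags enable. The exact return values of `get_temp_type_for_object` are an assumption here (inferred from its name and the surrounding diff), not confirmed by this commit:

```python
# Hypothetical stand-ins for the real constants/helper in analyzer_utils.
TEMPORARY_STRING_SET = {"temp", "temporary"}

def get_temp_type_for_object(use_scoped_temp_objects: bool, is_generated: bool) -> str:
    # Assumption: internally generated plans get session-scoped temp tables.
    return "SCOPED TEMPORARY" if (use_scoped_temp_objects and is_generated) else "TEMPORARY"

def resolved_table_type(table_type: str, use_scoped_temp_objects: bool, is_generated: bool) -> str:
    # Mirrors the inline conditional in the f-string above.
    if table_type.lower() in TEMPORARY_STRING_SET:
        return get_temp_type_for_object(use_scoped_temp_objects, is_generated).upper()
    return table_type.upper()

resolved_table_type("temporary", True, True)    # -> "SCOPED TEMPORARY"
resolved_table_type("temporary", False, False)  # -> "TEMPORARY"
```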
2 changes: 2 additions & 0 deletions src/snowflake/snowpark/_internal/analyzer/snowflake_plan.py
@@ -938,6 +938,8 @@ def get_create_table_as_select_plan(child: SnowflakePlan, replace, error):
change_tracking=change_tracking,
copy_grants=copy_grants,
iceberg_config=iceberg_config,
use_scoped_temp_objects=use_scoped_temp_objects,
is_generated=is_generated,
),
child,
source_plan,
27 changes: 11 additions & 16 deletions src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py
@@ -6,8 +6,6 @@
from collections import defaultdict
from typing import List, Optional, Tuple

from sortedcontainers import SortedList

from snowflake.snowpark._internal.analyzer.analyzer_utils import (
drop_table_if_exists_statement,
)
@@ -201,11 +199,11 @@ def _find_node_to_breakdown(self, root: TreeNode) -> Optional[TreeNode]:
1. Traverse the plan tree and find the valid nodes for partitioning.
2. If no valid node is found, return None.
3. Keep valid nodes in a sorted list based on the complexity score.
4. Return the node with the highest complexity score.
3. Return the node with the highest complexity score.
"""
current_level = [root]
pipeline_breaker_list = SortedList(key=lambda x: x[0])
candidate_node = None
candidate_score = -1 # start with -1 since score is always > 0

while current_level:
next_level = []
@@ -215,23 +213,20 @@
self._parent_map[child].add(node)
valid_to_breakdown, score = self._is_node_valid_to_breakdown(child)
if valid_to_breakdown:
# Append score and child to the pipeline breaker sorted list
# so that the valid child with the highest complexity score
# is at the end of the list.
pipeline_breaker_list.add((score, child))
# If the score for valid node is higher than the last candidate,
# update the candidate node and score.
if score > candidate_score:
candidate_score = score
candidate_node = child
else:
# don't traverse subtrees if parent is a valid candidate
next_level.append(child)

current_level = next_level

if not pipeline_breaker_list:
# Return None if no valid node is found for partitioning.
return None

# Get the node with the highest complexity score
_, child = pipeline_breaker_list.pop()
return child
# If no valid node is found, candidate_node will be None.
# Otherwise, return the node with the highest complexity score.
return candidate_node

def _get_partitioned_plan(self, root: TreeNode, child: TreeNode) -> SnowflakePlan:
"""This method takes cuts the child out from the root, creates a temp table plan for the
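The refactor replaces a `SortedList` of all valid candidates with simple max-tracking, since only the highest-scoring node is ever used. A condensed sketch of the pattern; `children_of` and `is_valid_to_breakdown` are stand-ins for the real plan-tree traversal and `_is_node_valid_to_breakdown`:

```python
from typing import Optional

def find_best_candidate(root: "TreeNode") -> Optional["TreeNode"]:
    """Level-order search for the valid node with the highest score."""
    best_node, best_score = None, -1  # scores are always positive
    current_level = [root]
    while current_level:
        next_level = []
        for node in current_level:
            for child in children_of(node):
                valid, score = is_valid_to_breakdown(child)
                if valid:
                    if score > best_score:
                        best_node, best_score = child, score
                else:
                    # Only descend into subtrees whose root is not
                    # itself a valid candidate.
                    next_level.append(child)
        current_level = next_level
    return best_node  # None when no valid node exists
```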
