From 5e2334e2f2464e17a4d27b3065bba9a2f12f4165 Mon Sep 17 00:00:00 2001
From: Afroz Alam
Date: Fri, 30 Aug 2024 09:15:40 -0700
Subject: [PATCH 1/3] Remove sortedcontainers dependency

---
 .../snowpark/_internal/compiler/large_query_breakdown.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py b/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py
index 34d27862ced..465f2e3d8b4 100644
--- a/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py
+++ b/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py
@@ -6,8 +6,6 @@
 from collections import defaultdict
 from typing import List, Optional, Tuple
 
-from sortedcontainers import SortedList
-
 from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     drop_table_if_exists_statement,
 )
@@ -205,7 +203,7 @@ def _find_node_to_breakdown(self, root: TreeNode) -> Optional[TreeNode]:
         4. Return the node with the highest complexity score.
         """
         current_level = [root]
-        pipeline_breaker_list = SortedList(key=lambda x: x[0])
+        pipeline_breaker_list = []
 
         while current_level:
             next_level = []
@@ -218,7 +216,7 @@ def _find_node_to_breakdown(self, root: TreeNode) -> Optional[TreeNode]:
                     # Append score and child to the pipeline breaker sorted list
                     # so that the valid child with the highest complexity score
                     # is at the end of the list.
-                    pipeline_breaker_list.add((score, child))
+                    pipeline_breaker_list.append((score, child))
                 else:
                     # don't traverse subtrees if parent is a valid candidate
                     next_level.append(child)
@@ -230,7 +228,8 @@ def _find_node_to_breakdown(self, root: TreeNode) -> Optional[TreeNode]:
             return None
 
         # Get the node with the highest complexity score
-        _, child = pipeline_breaker_list.pop()
+        sorted_pipeline_breaker_list = sorted(pipeline_breaker_list, key=lambda x: x[0])
+        _, child = sorted_pipeline_breaker_list.pop()
         return child
 
     def _get_partitioned_plan(self, root: TreeNode, child: TreeNode) -> SnowflakePlan:

From 72455e0e8d0f2ae544228faa583345b1f9aa9d5d Mon Sep 17 00:00:00 2001
From: Afroz Alam
Date: Fri, 30 Aug 2024 09:22:47 -0700
Subject: [PATCH 2/3] Improve algorithm

---
 .../compiler/large_query_breakdown.py | 26 ++++++++-----------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py b/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py
index 465f2e3d8b4..5707d71dc33 100644
--- a/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py
+++ b/src/snowflake/snowpark/_internal/compiler/large_query_breakdown.py
@@ -199,11 +199,11 @@ def _find_node_to_breakdown(self, root: TreeNode) -> Optional[TreeNode]:
 
         1. Traverse the plan tree and find the valid nodes for partitioning.
         2. If no valid node is found, return None.
-        3. Keep valid nodes in a sorted list based on the complexity score.
-        4. Return the node with the highest complexity score.
+        3. Return the node with the highest complexity score.
""" current_level = [root] - pipeline_breaker_list = [] + candidate_node = None + candidate_score = -1 # start with -1 since score is always > 0 while current_level: next_level = [] @@ -213,24 +213,20 @@ def _find_node_to_breakdown(self, root: TreeNode) -> Optional[TreeNode]: self._parent_map[child].add(node) valid_to_breakdown, score = self._is_node_valid_to_breakdown(child) if valid_to_breakdown: - # Append score and child to the pipeline breaker sorted list - # so that the valid child with the highest complexity score - # is at the end of the list. - pipeline_breaker_list.append((score, child)) + # If the score for valid node is higher than the last candidate, + # update the candidate node and score. + if score > candidate_score: + candidate_score = score + candidate_node = child else: # don't traverse subtrees if parent is a valid candidate next_level.append(child) current_level = next_level - if not pipeline_breaker_list: - # Return None if no valid node is found for partitioning. - return None - - # Get the node with the highest complexity score - sorted_pipeline_breaker_list = sorted(pipeline_breaker_list, key=lambda x: x[0]) - _, child = sorted_pipeline_breaker_list.pop() - return child + # If no valid node is found, candidate_node will be None. + # Otherwise, return the node with the highest complexity score. + return candidate_node def _get_partitioned_plan(self, root: TreeNode, child: TreeNode) -> SnowflakePlan: """This method takes cuts the child out from the root, creates a temp table plan for the From d29f60c4123f5199455c6c14c220b9857cdb7e3a Mon Sep 17 00:00:00 2001 From: Afroz Alam Date: Fri, 30 Aug 2024 09:32:33 -0700 Subject: [PATCH 3/3] Add test --- tests/integ/test_large_query_breakdown.py | 34 +++++++++++++++-------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/tests/integ/test_large_query_breakdown.py b/tests/integ/test_large_query_breakdown.py index 1368bf460f2..72ade31d456 100644 --- a/tests/integ/test_large_query_breakdown.py +++ b/tests/integ/test_large_query_breakdown.py @@ -47,6 +47,7 @@ def setup(session): cte_optimization_enabled = session._cte_optimization_enabled is_query_compilation_stage_enabled = session._query_compilation_stage_enabled session._query_compilation_stage_enabled = True + session._large_query_breakdown_enabled = True yield session._query_compilation_stage_enabled = is_query_compilation_stage_enabled session._cte_optimization_enabled = cte_optimization_enabled @@ -77,11 +78,32 @@ def check_result_with_and_without_breakdown(session, df): session._large_query_breakdown_enabled = large_query_enabled +def test_no_valid_nodes_found(session, large_query_df, caplog): + """Test large query breakdown works with default bounds""" + set_bounds(300, 600) + + base_df = session.sql("select 1 as A, 2 as B") + df1 = base_df.with_column("A", col("A") + lit(1)) + df2 = base_df.with_column("B", col("B") + lit(1)) + + for i in range(102): + df1 = df1.with_column("A", col("A") + lit(i)) + df2 = df2.with_column("B", col("B") + lit(i)) + + union_df = df1.union_all(df2) + final_df = union_df.with_column("A", col("A") + lit(1)) + + with caplog.at_level(logging.DEBUG): + queries = final_df.queries + assert len(queries["queries"]) == 1, queries["queries"] + assert len(queries["post_actions"]) == 0, queries["post_actions"] + assert "Could not find a valid node for partitioning" in caplog.text + + def test_large_query_breakdown_with_cte_optimization(session): """Test large query breakdown works with cte optimized plan""" set_bounds(300, 600) 
     session._cte_optimization_enabled = True
-    session._large_query_breakdown_enabled = True
     df0 = session.sql("select 2 as b, 32 as c")
     df1 = session.sql("select 1 as a, 2 as b").filter(col("a") == 1)
     df1 = df1.join(df0, on=["b"], how="inner")
@@ -108,7 +130,6 @@ def check_result_with_and_without_breakdown(session, df):
 
 def test_save_as_table(session, large_query_df):
     set_bounds(300, 600)
-    session._large_query_breakdown_enabled = True
     table_name = Utils.random_table_name()
     with session.query_history() as history:
         large_query_df.write.save_as_table(table_name, mode="overwrite")
@@ -164,7 +185,6 @@ def test_update_delete_merge(session, large_query_df):
 
 def test_copy_into_location(session, large_query_df):
     set_bounds(300, 600)
-    session._large_query_breakdown_enabled = True
     remote_file_path = f"{session.get_session_stage()}/df.parquet"
     with session.query_history() as history:
         large_query_df.write.copy_into_location(
@@ -183,7 +203,6 @@ def test_copy_into_location(session, large_query_df):
 
 def test_pivot_unpivot(session):
     set_bounds(300, 600)
-    session._large_query_breakdown_enabled = True
     session.sql(
         """create or replace temp table monthly_sales(A int, B int, month text)
             as select * from values
@@ -223,7 +242,6 @@ def test_pivot_unpivot(session):
 
 def test_sort(session):
     set_bounds(300, 600)
-    session._large_query_breakdown_enabled = True
     base_df = session.sql("select 1 as A, 2 as B")
     df1 = base_df.with_column("A", col("A") + lit(1))
     df2 = base_df.with_column("B", col("B") + lit(1))
@@ -258,7 +276,6 @@ def test_sort(session):
 def test_multiple_query_plan(session, large_query_df):
     set_bounds(300, 600)
     original_threshold = analyzer.ARRAY_BIND_THRESHOLD
-    session._large_query_breakdown_enabled = True
     try:
         analyzer.ARRAY_BIND_THRESHOLD = 2
         base_df = session.create_dataframe([[1, 2], [3, 4]], schema=["A", "B"])
@@ -296,7 +313,6 @@ def test_multiple_query_plan(session, large_query_df):
 def test_optimization_skipped_with_transaction(session, large_query_df, caplog):
     """Test large query breakdown is skipped when transaction is enabled"""
     set_bounds(300, 600)
-    session._large_query_breakdown_enabled = True
     session.sql("begin").collect()
     assert Utils.is_active_transaction(session)
     with caplog.at_level(logging.DEBUG):
@@ -316,7 +332,6 @@ def test_optimization_skipped_with_views_and_dynamic_tables(session, caplog):
     source_table = Utils.random_table_name()
     table_name = Utils.random_table_name()
     view_name = Utils.random_view_name()
-    session._large_query_breakdown_enabled = True
     try:
         session.sql("select 1 as a, 2 as b").write.save_as_table(source_table)
         df = session.table(source_table)
@@ -344,7 +359,6 @@ def test_optimization_skipped_with_views_and_dynamic_tables(session, caplog):
 def test_async_job_with_large_query_breakdown(session, large_query_df):
     """Test large query breakdown gives same result for async and non-async jobs"""
     set_bounds(300, 600)
-    session._large_query_breakdown_enabled = True
     job = large_query_df.collect(block=False)
     result = job.result()
     assert result == large_query_df.collect()
@@ -362,7 +376,6 @@ def test_complexity_bounds_affect_num_partitions(session, large_query_df):
     Also test that when partitions are added, drop table queries are added.
""" set_bounds(300, 600) - session._large_query_breakdown_enabled = True assert len(large_query_df.queries["queries"]) == 2 assert len(large_query_df.queries["post_actions"]) == 1 assert large_query_df.queries["queries"][0].startswith("CREATE TEMP TABLE") @@ -371,7 +384,6 @@ def test_complexity_bounds_affect_num_partitions(session, large_query_df): ) set_bounds(300, 412) - session._large_query_breakdown_enabled = True assert len(large_query_df.queries["queries"]) == 3 assert len(large_query_df.queries["post_actions"]) == 2 assert large_query_df.queries["queries"][0].startswith("CREATE TEMP TABLE")