-
Notifications
You must be signed in to change notification settings - Fork 116
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SNOW-841405 Fix df copy and enable basic diamond shaped joins for simplifier #1003
Changes from 4 commits
4327cb7
35a54dc
9a99bd7
40cf9e4
d2e1100
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -1118,23 +1118,23 @@ def test_select_columns_on_join_result_with_conflict_name(session): | |||
assert df4.collect() == [Row(3, 4, 1)] | ||||
|
||||
|
||||
def test_join_diamond_shape_error(session): | ||||
def test_nested_join_diamond_shape_error(session): | ||||
"""This is supposed to work but currently we don't handle it correctly. We should fix this with a good design.""" | ||||
df1 = session.create_dataframe([[1]], schema=["a"]) | ||||
df2 = session.create_dataframe([[1]], schema=["a"]) | ||||
df3 = df1.join(df2, df1["a"] == df2["a"]) | ||||
df4 = df3.select(df1["a"].as_("a")) | ||||
# df1["a"] and df4["a"] have the same expr_id in the expr_to_alias map. When they join, only one will be in df5's alias | ||||
# map. It leaves the other one resolved to "a" instead of the alias. | ||||
df5 = df1.join(df4, df1["a"] == df4["a"]) | ||||
df5 = df1.join(df4, df1["a"] == df4["a"]) # (df1) JOIN ((df1 JOIN df2)->df4) | ||||
with pytest.raises( | ||||
SnowparkSQLAmbiguousJoinException, | ||||
match="The reference to the column 'A' is ambiguous.", | ||||
): | ||||
df5.collect() | ||||
|
||||
|
||||
def test_join_diamond_shape_workaround(session): | ||||
def test_nested_join_diamond_shape_workaround(session): | ||||
df1 = session.create_dataframe([[1]], schema=["a"]) | ||||
df2 = session.create_dataframe([[1]], schema=["a"]) | ||||
df3 = df1.join(df2, df1["a"] == df2["a"]) | ||||
|
@@ -1143,3 +1143,19 @@ def test_join_diamond_shape_workaround(session): | |||
df1_converted = df1.select(df1["a"]) | ||||
df5 = df1_converted.join(df4, df1_converted["a"] == df4["a"]) | ||||
Utils.check_answer(df5, [Row(1, 1)]) | ||||
|
||||
|
||||
def test_dataframe_basic_diamond_shaped_join(session): | ||||
df1 = session.create_dataframe([[1, 2], [3, 4], [5, 6]], schema=["a", "b"]) | ||||
df2 = df1.filter(col("a") > 1).with_column("c", lit(7)) | ||||
assert df1.a._expression.expr_id != df2.a._expression.expr_id | ||||
|
||||
Comment on lines
+1151
to
+1152
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are we testing this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If they are the same, we won't be able to perform the join.
|
||||
# (df1) JOIN (df1->df2) | ||||
Utils.check_answer( | ||||
df1.join(df2, df1.a == df2.a).select(df1.a, df2.c), [Row(3, 7), Row(5, 7)] | ||||
) | ||||
|
||||
# (df1->df3) JOIN (df1-> df2) | ||||
df3 = df1.filter(col("b") < 6).with_column("d", lit(8)) | ||||
assert df2.b._expression.expr_id != df3.b._expression.expr_id | ||||
Utils.check_answer(df3.join(df2, df2.b == df3.b).select(df2.a, df3.d), [Row(3, 8)]) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Curious what triggered this change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is actually a bug; I intended it to be what the type hints suggest: a defaultdict of dictionaries. This PR fixes dataframe.copy to work for the simplifier path, so this was caught.