-
Notifications
You must be signed in to change notification settings - Fork 188
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Correctly handle single type uniontypes in Coral #507
Changes from 4 commits
fe7754a
41accbf
5ddb3bc
7ef4103
7dc0116
a18673a
8207003
7bf6702
fb89e0f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
/** | ||
* Copyright 2022-2023 LinkedIn Corporation. All rights reserved. | ||
* Copyright 2022-2024 LinkedIn Corporation. All rights reserved. | ||
* Licensed under the BSD-2 Clause license. | ||
* See LICENSE in the project root for license information. | ||
*/ | ||
|
@@ -41,6 +41,12 @@ public class RelDataTypeToHiveTypeStringConverter { | |
private RelDataTypeToHiveTypeStringConverter() { | ||
} | ||
|
||
public RelDataTypeToHiveTypeStringConverter(boolean convertUnionTypes) { | ||
this.convertUnionTypes = convertUnionTypes; | ||
} | ||
|
||
private static boolean convertUnionTypes = false; | ||
|
||
/** | ||
* @param relDataType a given RelDataType | ||
* @return a syntactically and semantically correct Hive type string for relDataType | ||
|
@@ -110,6 +116,15 @@ public static String convertRelDataType(RelDataType relDataType) { | |
*/ | ||
private static String buildStructDataTypeString(RelRecordType relRecordType) { | ||
List<String> structFieldStrings = new ArrayList<>(); | ||
|
||
// Convert single uniontypes back to Hive representation so coalesce_struct UDF can handle | ||
// single uniontypes correctly | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Description/code in coral-common should not be custom to Spark or at least reference Spark, when it can be generic. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed reference. |
||
if (convertUnionTypes && relRecordType.getFieldList().size() == 2 | ||
&& relRecordType.getFieldList().get(0).getName().equals("tag") | ||
&& relRecordType.getFieldList().get(1).getName().equals("field0")) { | ||
return String.format("uniontype<%s>", convertRelDataType(relRecordType.getFieldList().get(1).getType())); | ||
} | ||
|
||
for (RelDataTypeField fieldRelDataType : relRecordType.getFieldList()) { | ||
structFieldStrings | ||
.add(String.format("%s:%s", fieldRelDataType.getName(), convertRelDataType(fieldRelDataType.getType()))); | ||
|
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
/** | ||
* Copyright 2018-2023 LinkedIn Corporation. All rights reserved. | ||
* Copyright 2018-2024 LinkedIn Corporation. All rights reserved. | ||
* Licensed under the BSD-2 Clause license. | ||
* See LICENSE in the project root for license information. | ||
*/ | ||
|
@@ -13,6 +13,7 @@ | |
import org.apache.calcite.plan.RelOptTable; | ||
import org.apache.calcite.plan.RelOptUtil; | ||
import org.apache.calcite.rel.RelNode; | ||
import org.apache.calcite.sql.SqlIdentifier; | ||
import org.apache.calcite.sql.SqlKind; | ||
import org.apache.calcite.sql.SqlNode; | ||
import org.apache.calcite.sql.SqlSelect; | ||
|
@@ -76,7 +77,7 @@ public static CoralSpark create(RelNode irRelNode, HiveMetastoreClient hmsClient | |
SparkRelInfo sparkRelInfo = IRRelToSparkRelTransformer.transform(irRelNode); | ||
Set<SparkUDFInfo> sparkUDFInfos = sparkRelInfo.getSparkUDFInfos(); | ||
RelNode sparkRelNode = sparkRelInfo.getSparkRelNode(); | ||
SqlNode sparkSqlNode = constructSparkSqlNode(sparkRelNode, sparkUDFInfos); | ||
SqlNode sparkSqlNode = constructSparkSqlNode(sparkRelNode, sparkUDFInfos, hmsClient); | ||
String sparkSQL = constructSparkSQL(sparkSqlNode); | ||
List<String> baseTables = constructBaseTables(sparkRelNode); | ||
return new CoralSpark(baseTables, ImmutableList.copyOf(sparkUDFInfos), sparkSQL, hmsClient, sparkSqlNode); | ||
|
@@ -101,30 +102,44 @@ private static CoralSpark createWithAlias(RelNode irRelNode, List<String> aliase | |
SparkRelInfo sparkRelInfo = IRRelToSparkRelTransformer.transform(irRelNode); | ||
Set<SparkUDFInfo> sparkUDFInfos = sparkRelInfo.getSparkUDFInfos(); | ||
RelNode sparkRelNode = sparkRelInfo.getSparkRelNode(); | ||
SqlNode sparkSqlNode = constructSparkSqlNode(sparkRelNode, sparkUDFInfos); | ||
SqlNode sparkSqlNode = constructSparkSqlNode(sparkRelNode, sparkUDFInfos, hmsClient); | ||
// Use a second pass visit to add explicit alias names, | ||
// only do this when it's not a select star case, | ||
// since for select star we don't need to add any explicit aliases | ||
if (sparkSqlNode.getKind() == SqlKind.SELECT && ((SqlSelect) sparkSqlNode).getSelectList() != null) { | ||
if (sparkSqlNode.getKind() == SqlKind.SELECT && !isSelectStar(sparkSqlNode)) { | ||
sparkSqlNode = sparkSqlNode.accept(new AddExplicitAlias(aliases)); | ||
} | ||
String sparkSQL = constructSparkSQL(sparkSqlNode); | ||
List<String> baseTables = constructBaseTables(sparkRelNode); | ||
return new CoralSpark(baseTables, ImmutableList.copyOf(sparkUDFInfos), sparkSQL, hmsClient, sparkSqlNode); | ||
} | ||
|
||
private static SqlNode constructSparkSqlNode(RelNode sparkRelNode, Set<SparkUDFInfo> sparkUDFInfos) { | ||
private static SqlNode constructSparkSqlNode(RelNode sparkRelNode, Set<SparkUDFInfo> sparkUDFInfos, | ||
HiveMetastoreClient hmsClient) { | ||
CoralRelToSqlNodeConverter rel2sql = new CoralRelToSqlNodeConverter(); | ||
SqlNode coralSqlNode = rel2sql.convert(sparkRelNode); | ||
SqlNode sparkSqlNode = coralSqlNode.accept(new CoralSqlNodeToSparkSqlNodeConverter()) | ||
.accept(new CoralToSparkSqlCallConverter(sparkUDFInfos)); | ||
|
||
SqlNode coralSqlNodeWithRelDataTypeDerivedConversions = | ||
coralSqlNode.accept(new DataTypeDerivedSqlCallConverter(hmsClient, coralSqlNode, sparkUDFInfos)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did not follow the intuition behind this (specifically in the context of single union types). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. During coral-spark RHS, RelNode -> SqlNode translation layer, there didn't used to be a need for transformers that require rel type derivation. However now, we need type derivation in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Incidentally this is a needed step for Coral IR upgrades, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need a discussion around this. Our objective here is to standardize There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discussed offline that we organize CoralRelNode -> LanguageSqlNode as 3 logical steps:
1 and 2 must be separated into 2 steps in that order as intermixing type derivation transformers with no type derivation transformers causes failures due to no type derivation transformers introducing certain operators that the type derivation util cannot yet handle. A future PR will be set up to refactor these steps into a well documented class that loops through the 3 |
||
|
||
SqlNode sparkSqlNode = coralSqlNodeWithRelDataTypeDerivedConversions | ||
.accept(new CoralSqlNodeToSparkSqlNodeConverter()).accept(new CoralToSparkSqlCallConverter(sparkUDFInfos)); | ||
return sparkSqlNode.accept(new SparkSqlRewriter()); | ||
} | ||
|
||
public static String constructSparkSQL(SqlNode sparkSqlNode) { | ||
return sparkSqlNode.toSqlString(SparkSqlDialect.INSTANCE).getSql(); | ||
} | ||
|
||
private static boolean isSelectStar(SqlNode node) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does this function have to do with single unions? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My apologies for not explaining earlier. After adding in type derivation transformations on coral spark RHS, there was a side-effect where the old detection for select star queries no longer worked requiring us to update the detection logic. This is a more robust check anyhow. |
||
if (node.getKind() == SqlKind.SELECT && ((SqlSelect) node).getSelectList().size() == 1 | ||
&& ((SqlSelect) node).getSelectList().get(0) instanceof SqlIdentifier) { | ||
SqlIdentifier identifier = (SqlIdentifier) ((SqlSelect) node).getSelectList().get(0); | ||
return identifier.isStar(); | ||
} | ||
return false; | ||
} | ||
|
||
/** | ||
* This function returns the list of base table names, in the format | ||
* "database_name.table_name". | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
/** | ||
* Copyright 2022-2024 LinkedIn Corporation. All rights reserved. | ||
* Licensed under the BSD-2 Clause license. | ||
* See LICENSE in the project root for license information. | ||
*/ | ||
package com.linkedin.coral.spark; | ||
|
||
import java.util.Set; | ||
|
||
import org.apache.calcite.sql.SqlCall; | ||
import org.apache.calcite.sql.SqlNode; | ||
import org.apache.calcite.sql.util.SqlShuttle; | ||
|
||
import com.linkedin.coral.common.HiveMetastoreClient; | ||
import com.linkedin.coral.common.transformers.SqlCallTransformers; | ||
import com.linkedin.coral.common.utils.TypeDerivationUtil; | ||
import com.linkedin.coral.hive.hive2rel.HiveToRelConverter; | ||
import com.linkedin.coral.spark.containers.SparkUDFInfo; | ||
import com.linkedin.coral.spark.transformers.ExtractUnionFunctionTransformer; | ||
|
||
|
||
/** | ||
* DataTypeDerivedSqlCallConverter transforms the sqlCalls | ||
* in the input SqlNode representation to be compatible with the Spark engine. | ||
* The transformation may involve change in operator, reordering the operands | ||
* or even re-constructing the SqlNode. | ||
* | ||
* All the transformations performed as part of this shuttle require RelDataType derivation. | ||
*/ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This description is confusing (also class name is confusing). Why would a spark Shuttle do something specific to Trino? The description sounds too generic and arbitrary without specific contract. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This class is a copy of
Forgot to change "Trino engine." to "Spark engine." after copying, fixed. |
||
public class DataTypeDerivedSqlCallConverter extends SqlShuttle { | ||
private final SqlCallTransformers operatorTransformerList; | ||
private final HiveToRelConverter toRelConverter; | ||
TypeDerivationUtil typeDerivationUtil; | ||
|
||
public DataTypeDerivedSqlCallConverter(HiveMetastoreClient mscClient, SqlNode topSqlNode, | ||
Set<SparkUDFInfo> sparkUDFInfos) { | ||
toRelConverter = new HiveToRelConverter(mscClient); | ||
typeDerivationUtil = new TypeDerivationUtil(toRelConverter.getSqlValidator(), topSqlNode); | ||
operatorTransformerList = | ||
SqlCallTransformers.of(new ExtractUnionFunctionTransformer(typeDerivationUtil, sparkUDFInfos)); | ||
} | ||
|
||
@Override | ||
public SqlNode visit(SqlCall call) { | ||
return operatorTransformerList.apply((SqlCall) super.visit(call)); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add Java doc on what the effect of the parameter is.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.