From bbf5f1587d7f6cb40dc396d6ff20a64136ea8306 Mon Sep 17 00:00:00 2001 From: Jiefan Li Date: Thu, 27 Jun 2024 18:03:11 -0400 Subject: [PATCH 01/12] Register Spark UDFs (#513) --- .../hive2rel/functions/StaticHiveFunctionRegistry.java | 9 ++++++++- .../coral/coralservice/metastore/MetastoreProvider.java | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/functions/StaticHiveFunctionRegistry.java b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/functions/StaticHiveFunctionRegistry.java index c9e9fd67b..95eeceed5 100644 --- a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/functions/StaticHiveFunctionRegistry.java +++ b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/functions/StaticHiveFunctionRegistry.java @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2019-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -673,6 +673,13 @@ public boolean isOptional(int i) { createAddUserDefinedFunction("com.linkedin.policy.decoration.udfs.RedactSecondarySchemaFieldIf", ARG1, family( SqlTypeFamily.BOOLEAN, SqlTypeFamily.ANY, SqlTypeFamily.ARRAY, SqlTypeFamily.CHARACTER, SqlTypeFamily.ANY)); + createAddUserDefinedFunction("com.linkedin.groot.runtime.udf.spark.HasMemberConsentUDF", ReturnTypes.BOOLEAN, + family(SqlTypeFamily.STRING, SqlTypeFamily.ANY, SqlTypeFamily.TIMESTAMP)); + createAddUserDefinedFunction("com.linkedin.groot.runtime.udf.spark.RedactFieldIfUDF", ARG1, + family(SqlTypeFamily.BOOLEAN, SqlTypeFamily.ANY, SqlTypeFamily.STRING, SqlTypeFamily.ANY)); + createAddUserDefinedFunction("com.linkedin.groot.runtime.udf.spark.RedactSecondarySchemaFieldIfUDF", ARG1, family( + SqlTypeFamily.BOOLEAN, SqlTypeFamily.ANY, SqlTypeFamily.ARRAY, SqlTypeFamily.STRING, SqlTypeFamily.STRING)); + // UDTFs addFunctionEntry("explode", new CoralSqlUnnestOperator(false)); addFunctionEntry("posexplode", new CoralSqlUnnestOperator(true)); diff --git a/coral-service/src/main/java/com/linkedin/coral/coralservice/metastore/MetastoreProvider.java b/coral-service/src/main/java/com/linkedin/coral/coralservice/metastore/MetastoreProvider.java index dce48db44..1f0a81b06 100644 --- a/coral-service/src/main/java/com/linkedin/coral/coralservice/metastore/MetastoreProvider.java +++ b/coral-service/src/main/java/com/linkedin/coral/coralservice/metastore/MetastoreProvider.java @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2022-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ From 185b9acb7deafb1cc3c555adc2bc3b5cfb9561c4 Mon Sep 17 00:00:00 2001 From: Jiefan Li Date: Tue, 9 Jul 2024 14:07:54 -0400 Subject: [PATCH 02/12] Use Sonatype token username and token password (#515) --- .github/workflows/ci.yml | 4 ++-- gradle/shipkit.gradle | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2ce0a3629..22a2c7d1b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,7 +45,7 @@ jobs: run: ./gradlew githubRelease publishToSonatype closeAndReleaseStagingRepository --stacktrace env: GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} - SONATYPE_USER: ${{secrets.SONATYPE_USER}} - SONATYPE_PWD: ${{secrets.SONATYPE_PWD}} + SONATYPE_TOKEN_USERNAME: ${{secrets.SONATYPE_TOKEN_USERNAME}} + SONATYPE_TOKEN_PASSWORD: ${{secrets.SONATYPE_TOKEN_PASSWORD}} PGP_KEY: ${{secrets.PGP_KEY}} PGP_PWD: ${{secrets.PGP_PWD}} diff --git a/gradle/shipkit.gradle b/gradle/shipkit.gradle index 95ea8990c..5d79d5c1e 100644 --- a/gradle/shipkit.gradle +++ b/gradle/shipkit.gradle @@ -21,10 +21,10 @@ tasks.named("githubRelease") { apply plugin: "io.github.gradle-nexus.publish-plugin" //https://github.com/gradle-nexus/publish-plugin/ nexusPublishing { repositories { - if (System.getenv("SONATYPE_PWD")) { + if (System.getenv("SONATYPE_TOKEN_PASSWORD")) { sonatype { - username = System.getenv("SONATYPE_USER") - password = System.getenv("SONATYPE_PWD") + username = System.getenv("SONATYPE_TOKEN_USERNAME") + password = System.getenv("SONATYPE_TOKEN_PASSWORD") nexusUrl.set(uri("https://s01.oss.sonatype.org/service/local/")) snapshotRepositoryUrl.set(uri("https://s01.oss.sonatype.org/content/repositories/snapshots/")) } From f93b1c3773ac239b33a9b9f2887a2eed7231debb Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Tue, 16 Jul 2024 17:03:31 -0400 Subject: [PATCH 03/12] [Coral-Trino] Fix redundant transforms/casts from GenericProjectTransformer (#512) * fix redundant transforms/casts * spotless * fix fuzzyunionsqlrewriter incorrect struct type when building structs * remove unused * spotless * spotless --- .../com/linkedin/coral/common/FuzzyUnionSqlRewriter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/coral-common/src/main/java/com/linkedin/coral/common/FuzzyUnionSqlRewriter.java b/coral-common/src/main/java/com/linkedin/coral/common/FuzzyUnionSqlRewriter.java index c2b8400bd..e57c19927 100644 --- a/coral-common/src/main/java/com/linkedin/coral/common/FuzzyUnionSqlRewriter.java +++ b/coral-common/src/main/java/com/linkedin/coral/common/FuzzyUnionSqlRewriter.java @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2019-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -237,7 +237,8 @@ private RelDataType getUnionDataType(final List dataTypes) { if (baseDataType.isStruct()) { // Build the common UNION type using the first branch that appears in the query final RelDataTypeFactory.Builder builder = - new RelDataTypeFactory.Builder(toRelConverter.getRelBuilder().getTypeFactory()); + new RelDataTypeFactory.Builder(toRelConverter.getRelBuilder().getTypeFactory()) + .kind(baseDataType.getStructKind()); // Build a set of common fields by name in the given dataTypes Set commonFieldNames = From 96e24e01b78dd241e2085fcf79d8d9fc9031be4b Mon Sep 17 00:00:00 2001 From: Pratham Date: Thu, 18 Jul 2024 21:37:45 -0400 Subject: [PATCH 04/12] Add function in static hive registry (#519) --- .../hive/hive2rel/functions/StaticHiveFunctionRegistry.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/functions/StaticHiveFunctionRegistry.java b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/functions/StaticHiveFunctionRegistry.java index 95eeceed5..c7029f035 100644 --- a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/functions/StaticHiveFunctionRegistry.java +++ b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/functions/StaticHiveFunctionRegistry.java @@ -679,6 +679,8 @@ public boolean isOptional(int i) { family(SqlTypeFamily.BOOLEAN, SqlTypeFamily.ANY, SqlTypeFamily.STRING, SqlTypeFamily.ANY)); createAddUserDefinedFunction("com.linkedin.groot.runtime.udf.spark.RedactSecondarySchemaFieldIfUDF", ARG1, family( SqlTypeFamily.BOOLEAN, SqlTypeFamily.ANY, SqlTypeFamily.ARRAY, SqlTypeFamily.STRING, SqlTypeFamily.STRING)); + createAddUserDefinedFunction("com.linkedin.groot.runtime.udf.spark.GetMappedValueUDF", FunctionReturnTypes.STRING, + family(SqlTypeFamily.STRING, SqlTypeFamily.STRING)); // UDTFs addFunctionEntry("explode", new CoralSqlUnnestOperator(false)); From f817412f45abaefd5a57996c2a9dc11b9b34071b Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Wed, 24 Jul 2024 15:07:20 -0400 Subject: [PATCH 05/12] rename (#518) --- ...onvertletTable.java => CoralConvertletTable.java} | 12 +++++++----- .../coral/hive/hive2rel/HiveToRelConverter.java | 2 +- .../coral/trino/trino2rel/TrinoToRelConverter.java | 6 +++--- 3 files changed, 11 insertions(+), 9 deletions(-) rename coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/{HiveConvertletTable.java => CoralConvertletTable.java} (79%) diff --git a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveConvertletTable.java b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/CoralConvertletTable.java similarity index 79% rename from coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveConvertletTable.java rename to coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/CoralConvertletTable.java index 5b82a8595..774e81d0f 100644 --- a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveConvertletTable.java +++ b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/CoralConvertletTable.java @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2018-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -20,10 +20,11 @@ /** - * ConvertletTable for Hive Operators + * ConvertletTable for transformations only relevant to Coral's Intermediate Representation, not specific + * any SQL dialect. These transformations keep data parity between the SqlNode and RelNode layer, keeping the IR intact. * @see ReflectiveConvertletTable documentation for method naming and visibility rules */ -public class HiveConvertletTable extends ReflectiveConvertletTable { +public class CoralConvertletTable extends ReflectiveConvertletTable { @SuppressWarnings("unused") public RexNode convertFunctionFieldReferenceOperator(SqlRexContext cx, FunctionFieldReferenceOperator op, @@ -33,14 +34,15 @@ public RexNode convertFunctionFieldReferenceOperator(SqlRexContext cx, FunctionF return cx.getRexBuilder().makeFieldAccess(funcExpr, fieldName, false); } + /** + * Override {@link StandardConvertletTable#convertCast} to avoid cast optimizations that remove the cast. + */ @SuppressWarnings("unused") public RexNode convertCast(SqlRexContext cx, SqlCastFunction cast, SqlCall call) { final SqlNode left = call.operand(0); RexNode leftRex = cx.convertExpression(left); SqlDataTypeSpec dataType = call.operand(1); RelDataType castType = dataType.deriveType(cx.getValidator(), true); - // can not call RexBuilder.makeCast() since that optimizes to remove the cast - // we don't want to remove the cast return cx.getRexBuilder().makeAbstractCast(castType, leftRex); } diff --git a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveToRelConverter.java b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveToRelConverter.java index 1b6b5b31d..f19b61800 100644 --- a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveToRelConverter.java +++ b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveToRelConverter.java @@ -70,7 +70,7 @@ public HiveFunctionResolver getFunctionResolver() { @Override protected SqlRexConvertletTable getConvertletTable() { - return new HiveConvertletTable(); + return new CoralConvertletTable(); } @Override diff --git a/coral-trino/src/main/java/com/linkedin/coral/trino/trino2rel/TrinoToRelConverter.java b/coral-trino/src/main/java/com/linkedin/coral/trino/trino2rel/TrinoToRelConverter.java index 2fc7ea6d6..25c2b6436 100644 --- a/coral-trino/src/main/java/com/linkedin/coral/trino/trino2rel/TrinoToRelConverter.java +++ b/coral-trino/src/main/java/com/linkedin/coral/trino/trino2rel/TrinoToRelConverter.java @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2017-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -24,8 +24,8 @@ import com.linkedin.coral.common.HiveMetastoreClient; import com.linkedin.coral.common.HiveRelBuilder; import com.linkedin.coral.common.ToRelConverter; +import com.linkedin.coral.hive.hive2rel.CoralConvertletTable; import com.linkedin.coral.hive.hive2rel.DaliOperatorTable; -import com.linkedin.coral.hive.hive2rel.HiveConvertletTable; import com.linkedin.coral.hive.hive2rel.HiveSqlValidator; import com.linkedin.coral.hive.hive2rel.functions.HiveFunctionResolver; import com.linkedin.coral.hive.hive2rel.functions.StaticHiveFunctionRegistry; @@ -63,7 +63,7 @@ public TrinoToRelConverter(Map>> localMetaStore @Override protected SqlRexConvertletTable getConvertletTable() { - return new HiveConvertletTable(); + return new CoralConvertletTable(); } @Override From d1d5b1ea0925b02c0506e794b061e30bb8976d5f Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Thu, 25 Jul 2024 12:05:10 -0400 Subject: [PATCH 06/12] Add missing numeric precisions in Hive type system (#501) * add precision for int and bigint for hive type system * use static final variables for precisions * spotless check * fix converting type to typespec bug * add UT * add tinyint and smallint precisions * spotless * spotless * spotless * new UTs * add tableInt to rel to trino tests * spotless * remove redundant test --- .../linkedin/coral/common/HiveTypeSystem.java | 16 +++++++- .../hive/hive2rel/HiveToRelConverterTest.java | 40 ++++++++++++++++++- .../coral/hive/hive2rel/TestUtils.java | 2 + .../FromUtcTimestampOperatorTransformer.java | 12 +----- .../coral/trino/rel2trino/TestUtils.java | 3 ++ 5 files changed, 60 insertions(+), 13 deletions(-) diff --git a/coral-common/src/main/java/com/linkedin/coral/common/HiveTypeSystem.java b/coral-common/src/main/java/com/linkedin/coral/common/HiveTypeSystem.java index 32ecd526d..6372a220b 100644 --- a/coral-common/src/main/java/com/linkedin/coral/common/HiveTypeSystem.java +++ b/coral-common/src/main/java/com/linkedin/coral/common/HiveTypeSystem.java @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2017-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -12,7 +12,7 @@ import org.apache.calcite.sql.type.SqlTypeUtil; -// Copied from Hive source code +// Precision and scale values copied from Hive source code public class HiveTypeSystem extends RelDataTypeSystemImpl { // TODO: This should come from type system; Currently there is no definition // in type system for this. @@ -29,6 +29,10 @@ public class HiveTypeSystem extends RelDataTypeSystemImpl { private static final int DEFAULT_CHAR_PRECISION = 255; private static final int MAX_BINARY_PRECISION = Integer.MAX_VALUE; private static final int MAX_TIMESTAMP_PRECISION = 9; + private static final int DEFAULT_TINYINT_PRECISION = 3; + private static final int DEFAULT_SMALLINT_PRECISION = 5; + private static final int DEFAULT_INTEGER_PRECISION = 10; + private static final int DEFAULT_BIGINT_PRECISION = 19; @Override public int getMaxScale(SqlTypeName typeName) { @@ -84,6 +88,14 @@ public int getDefaultPrecision(SqlTypeName typeName) { case INTERVAL_MINUTE_SECOND: case INTERVAL_SECOND: return SqlTypeName.DEFAULT_INTERVAL_START_PRECISION; + case TINYINT: + return DEFAULT_TINYINT_PRECISION; + case SMALLINT: + return DEFAULT_SMALLINT_PRECISION; + case INTEGER: + return DEFAULT_INTEGER_PRECISION; + case BIGINT: + return DEFAULT_BIGINT_PRECISION; default: return -1; } diff --git a/coral-hive/src/test/java/com/linkedin/coral/hive/hive2rel/HiveToRelConverterTest.java b/coral-hive/src/test/java/com/linkedin/coral/hive/hive2rel/HiveToRelConverterTest.java index d8f047777..e2eafb0e7 100644 --- a/coral-hive/src/test/java/com/linkedin/coral/hive/hive2rel/HiveToRelConverterTest.java +++ b/coral-hive/src/test/java/com/linkedin/coral/hive/hive2rel/HiveToRelConverterTest.java @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2017-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -622,6 +622,44 @@ public void testNameSakeColumnNamesShouldGetUniqueIdentifiers() { assertEquals(generated, expected); } + @Test + public void testUnionIntAndBigInt() { + final String sql = "SELECT a FROM test.tableOne UNION ALL SELECT bigint_col FROM test.tableInt"; + RelNode rel = converter.convertSql(sql); + assertEquals(rel.getRowType().getFieldCount(), 1); + // Should be BIGINT since it is a less restrictive type than INT + assertEquals(rel.getRowType().getFieldList().get(0).getType().getSqlTypeName(), SqlTypeName.BIGINT); + } + + @Test + public void testUnionIntAndSmallInt() { + final String sql = "SELECT smallint_col FROM test.tableInt UNION ALL SELECT a FROM test.tableOne"; + RelNode rel = converter.convertSql(sql); + assertEquals(rel.getRowType().getFieldCount(), 1); + // Should be INT since it is a less restrictive type than SMALLINT + assertEquals(rel.getRowType().getFieldList().get(0).getType().getSqlTypeName(), SqlTypeName.INTEGER); + } + + @Test + public void testUnionIntAndTinyInt() { + final String sql = "SELECT tinyint_col FROM test.tableInt UNION ALL SELECT a FROM test.tableOne"; + RelNode rel = converter.convertSql(sql); + assertEquals(rel.getRowType().getFieldCount(), 1); + // Should be INT since it is a less restrictive type than TINYINT + assertEquals(rel.getRowType().getFieldList().get(0).getType().getSqlTypeName(), SqlTypeName.INTEGER); + } + + @Test + public void testIntCastToBigIntDuringComparison() { + // We're testing that a comparison between INT and BIGINT sees a cast on the more restrictive type to the + // less restrictive type and not the other way around. In other words, the INT is cast to BIGINT. + final String sql = "SELECT CASE WHEN int_col = bigint_col THEN 'abc' ELSE 'def' END FROM test.tableInt"; + String expected = "LogicalProject(EXPR$0=[CASE(=(CAST($2):BIGINT, $3), 'abc', 'def')])\n" + + " LogicalTableScan(table=[[hive, test, tableint]])\n"; + + assertEquals(relToString(sql), expected); + } + private String relToString(String sql) { return RelOptUtil.toString(converter.convertSql(sql)); } diff --git a/coral-hive/src/test/java/com/linkedin/coral/hive/hive2rel/TestUtils.java b/coral-hive/src/test/java/com/linkedin/coral/hive/hive2rel/TestUtils.java index 8a1a437c4..5a6a9fe0a 100644 --- a/coral-hive/src/test/java/com/linkedin/coral/hive/hive2rel/TestUtils.java +++ b/coral-hive/src/test/java/com/linkedin/coral/hive/hive2rel/TestUtils.java @@ -82,6 +82,8 @@ public static TestHive setupDefaultHive(HiveConf conf) throws IOException { driver.run("CREATE DATABASE IF NOT EXISTS test"); driver.run("CREATE TABLE IF NOT EXISTS test.tableOne(a int, b varchar(30), c double, d timestamp)"); driver.run("CREATE TABLE IF NOT EXISTS test.tableTwo(x int, y double)"); + driver.run( + "CREATE TABLE IF NOT EXISTS test.tableInt(tinyint_col tinyint, smallint_col smallint, int_col int, bigint_col bigint)"); driver.run("CREATE DATABASE IF NOT EXISTS fuzzy_union"); diff --git a/coral-trino/src/main/java/com/linkedin/coral/trino/rel2trino/transformers/FromUtcTimestampOperatorTransformer.java b/coral-trino/src/main/java/com/linkedin/coral/trino/rel2trino/transformers/FromUtcTimestampOperatorTransformer.java index 81e30535f..eee27b50b 100644 --- a/coral-trino/src/main/java/com/linkedin/coral/trino/rel2trino/transformers/FromUtcTimestampOperatorTransformer.java +++ b/coral-trino/src/main/java/com/linkedin/coral/trino/rel2trino/transformers/FromUtcTimestampOperatorTransformer.java @@ -11,15 +11,13 @@ import java.util.Set; import org.apache.calcite.rel.type.RelDataType; -import org.apache.calcite.sql.SqlBasicTypeNameSpec; import org.apache.calcite.sql.SqlCall; -import org.apache.calcite.sql.SqlDataTypeSpec; import org.apache.calcite.sql.SqlNode; import org.apache.calcite.sql.SqlOperator; -import org.apache.calcite.sql.SqlTypeNameSpec; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.type.BasicSqlType; import org.apache.calcite.sql.type.SqlTypeName; +import org.apache.calcite.sql.type.SqlTypeUtil; import com.linkedin.coral.common.HiveTypeSystem; import com.linkedin.coral.common.calcite.CalciteUtil; @@ -122,12 +120,6 @@ protected SqlCall transform(SqlCall sqlCall) { } private SqlCall castOperand(SqlNode operand, RelDataType relDataType) { - return SqlStdOperatorTable.CAST.createCall(ZERO, operand, getSqlDataTypeSpecForCasting(relDataType)); - } - - private SqlDataTypeSpec getSqlDataTypeSpecForCasting(RelDataType relDataType) { - final SqlTypeNameSpec typeNameSpec = new SqlBasicTypeNameSpec(relDataType.getSqlTypeName(), - relDataType.getPrecision(), relDataType.getScale(), null, ZERO); - return new SqlDataTypeSpec(typeNameSpec, ZERO); + return SqlStdOperatorTable.CAST.createCall(ZERO, operand, SqlTypeUtil.convertTypeToSpec(relDataType)); } } diff --git a/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/TestUtils.java b/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/TestUtils.java index 148085261..3e6f088f8 100644 --- a/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/TestUtils.java +++ b/coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/TestUtils.java @@ -425,6 +425,9 @@ public static void initializeTablesAndViews(HiveConf conf) throws HiveException, run(driver, "CREATE TABLE IF NOT EXISTS test.tableFour(icol int, scol string, acol array, mcol map)"); + run(driver, + "CREATE TABLE IF NOT EXISTS test.tableInt(tinyint_col tinyint, smallint_col smallint, int_col int, bigint_col bigint)"); + } public static HiveConf loadResourceHiveConf() { From 74c2ca82dfbdec406230001320b3a347ef96b628 Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Wed, 31 Jul 2024 16:58:50 -0400 Subject: [PATCH 07/12] Correctly handle single type uniontypes in Coral (#507) * fix single uniontypes in Coral * remove SingleUnionFieldReferenceTransformer * remove field reference fix to put in separate PR * spotless * update comments * fix comment + add single uniontype test for RelDataTypeToHiveTypeStringConverter * spotless * improve Javadoc for ExtractUnionFunctionTransformer * use html brackets in javadoc --- .../linkedin/coral/common/TypeConverter.java | 5 +- .../FunctionFieldReferenceOperator.java | 11 +-- .../RelDataTypeToHiveTypeStringConverter.java | 29 ++++++- ...TypeToHiveDataTypeStringConverterTest.java | 16 +++- .../HiveSqlNodeToCoralSqlNodeConverter.java | 6 +- .../SingleUnionFieldReferenceTransformer.java | 49 ----------- .../com/linkedin/coral/spark/CoralSpark.java | 29 +++++-- .../spark/CoralToSparkSqlCallConverter.java | 6 +- .../DataTypeDerivedSqlCallConverter.java | 47 +++++++++++ .../ExtractUnionFunctionTransformer.java | 82 ++++++++++++++++++- .../linkedin/coral/spark/CoralSparkTest.java | 26 +++++- .../com/linkedin/coral/spark/TestUtils.java | 4 +- 12 files changed, 224 insertions(+), 86 deletions(-) delete mode 100644 coral-hive/src/main/java/com/linkedin/coral/transformers/SingleUnionFieldReferenceTransformer.java create mode 100644 coral-spark/src/main/java/com/linkedin/coral/spark/DataTypeDerivedSqlCallConverter.java diff --git a/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java b/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java index 183b4730b..022538eaa 100644 --- a/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java +++ b/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2017-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -147,9 +147,6 @@ public static RelDataType convert(StructTypeInfo structType, final RelDataTypeFa public static RelDataType convert(UnionTypeInfo unionType, RelDataTypeFactory dtFactory) { List fTypes = unionType.getAllUnionObjectTypeInfos().stream() .map(typeInfo -> convert(typeInfo, dtFactory)).collect(Collectors.toList()); - if (fTypes.size() == 1) { - return dtFactory.createTypeWithNullability(fTypes.get(0), true); - } List fNames = IntStream.range(0, unionType.getAllUnionObjectTypeInfos().size()).mapToObj(i -> "field" + i) .collect(Collectors.toList()); fTypes.add(0, dtFactory.createSqlType(SqlTypeName.INTEGER)); diff --git a/coral-common/src/main/java/com/linkedin/coral/common/functions/FunctionFieldReferenceOperator.java b/coral-common/src/main/java/com/linkedin/coral/common/functions/FunctionFieldReferenceOperator.java index ec4e125ea..f947da078 100644 --- a/coral-common/src/main/java/com/linkedin/coral/common/functions/FunctionFieldReferenceOperator.java +++ b/coral-common/src/main/java/com/linkedin/coral/common/functions/FunctionFieldReferenceOperator.java @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2018-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -74,15 +74,6 @@ public RelDataType deriveType(SqlValidator validator, SqlValidatorScope scope, S if (funcType.isStruct()) { return funcType.getField(fieldNameStripQuotes(call.operand(1)), false, false).getType(); } - - // When the first operand is a SqlBasicCall with a non-struct RelDataType and the second operand is `tag_0`, - // such as `extract_union`(`product`.`value`).`tag_0` or (`extract_union`(`product`.`value`).`id`).`tag_0`, - // derived data type is first operand's RelDataType. - // This strategy ensures that RelDataType derivation remains successful for the specified sqlCalls while maintaining backward compatibility. - // Such SqlCalls are transformed {@link com.linkedin.coral.transformers.SingleUnionFieldReferenceTransformer} - if (FunctionFieldReferenceOperator.fieldNameStripQuotes(call.operand(1)).equalsIgnoreCase("tag_0")) { - return funcType; - } } return super.deriveType(validator, scope, call); } diff --git a/coral-common/src/main/java/com/linkedin/coral/common/utils/RelDataTypeToHiveTypeStringConverter.java b/coral-common/src/main/java/com/linkedin/coral/common/utils/RelDataTypeToHiveTypeStringConverter.java index 1aea5a064..c5a652af2 100644 --- a/coral-common/src/main/java/com/linkedin/coral/common/utils/RelDataTypeToHiveTypeStringConverter.java +++ b/coral-common/src/main/java/com/linkedin/coral/common/utils/RelDataTypeToHiveTypeStringConverter.java @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2022-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -41,6 +41,25 @@ public class RelDataTypeToHiveTypeStringConverter { private RelDataTypeToHiveTypeStringConverter() { } + public RelDataTypeToHiveTypeStringConverter(boolean convertUnionTypes) { + this.convertUnionTypes = convertUnionTypes; + } + + /** + * If true, Coral will convert single uniontypes back to Hive's native uniontype representation. This is necessary + * because some engines have readers that unwrap Hive single uniontypes to just the underlying data type, causing + * the loss of information that the column was originally a uniontype in Hive. This can be problematic when calling + * the `coalesce_struct` UDF on such columns, as they are expected to be treated as uniontypes. Retaining the + * original uniontype record and passing it into `coalesce_struct` ensures correct handling. + * + * Example: + * RelDataType: + * struct(tag:integer,field0:varchar) + * Hive Type String: + * uniontype<string> + */ + private static boolean convertUnionTypes = false; + /** * @param relDataType a given RelDataType * @return a syntactically and semantically correct Hive type string for relDataType @@ -110,6 +129,14 @@ public static String convertRelDataType(RelDataType relDataType) { */ private static String buildStructDataTypeString(RelRecordType relRecordType) { List structFieldStrings = new ArrayList<>(); + + // Convert single uniontypes as structs back to native Hive representation + if (convertUnionTypes && relRecordType.getFieldList().size() == 2 + && relRecordType.getFieldList().get(0).getName().equals("tag") + && relRecordType.getFieldList().get(1).getName().equals("field0")) { + return String.format("uniontype<%s>", convertRelDataType(relRecordType.getFieldList().get(1).getType())); + } + for (RelDataTypeField fieldRelDataType : relRecordType.getFieldList()) { structFieldStrings .add(String.format("%s:%s", fieldRelDataType.getName(), convertRelDataType(fieldRelDataType.getType()))); diff --git a/coral-common/src/test/java/com/linkedin/coral/common/utils/RelDataTypeToHiveDataTypeStringConverterTest.java b/coral-common/src/test/java/com/linkedin/coral/common/utils/RelDataTypeToHiveDataTypeStringConverterTest.java index e2c5ca637..0ac88bcc9 100644 --- a/coral-common/src/test/java/com/linkedin/coral/common/utils/RelDataTypeToHiveDataTypeStringConverterTest.java +++ b/coral-common/src/test/java/com/linkedin/coral/common/utils/RelDataTypeToHiveDataTypeStringConverterTest.java @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2019-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -178,4 +178,18 @@ public void testCharRelDataType() { assertEquals(hiveDataTypeSchemaString, expectedHiveDataTypeSchemaString); } + + @Test + public void testSingleUniontypeStructRelDataType() { + String expectedHiveDataTypeSchemaString = "uniontype"; + + List fields = new ArrayList<>(); + fields.add(new RelDataTypeFieldImpl("tag", 0, new BasicSqlType(RelDataTypeSystem.DEFAULT, SqlTypeName.INTEGER))); + fields.add(new RelDataTypeFieldImpl("field0", 0, new BasicSqlType(RelDataTypeSystem.DEFAULT, SqlTypeName.VARCHAR))); + + RelRecordType relRecordType = new RelRecordType(fields); + String hiveDataTypeSchemaString = new RelDataTypeToHiveTypeStringConverter(true).convertRelDataType(relRecordType); + + assertEquals(hiveDataTypeSchemaString, expectedHiveDataTypeSchemaString); + } } diff --git a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveSqlNodeToCoralSqlNodeConverter.java b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveSqlNodeToCoralSqlNodeConverter.java index ab2fd8c32..8525a62f8 100644 --- a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveSqlNodeToCoralSqlNodeConverter.java +++ b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveSqlNodeToCoralSqlNodeConverter.java @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2017-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -13,7 +13,6 @@ import com.linkedin.coral.common.transformers.SqlCallTransformers; import com.linkedin.coral.common.utils.TypeDerivationUtil; import com.linkedin.coral.transformers.ShiftArrayIndexTransformer; -import com.linkedin.coral.transformers.SingleUnionFieldReferenceTransformer; /** @@ -24,8 +23,7 @@ public class HiveSqlNodeToCoralSqlNodeConverter extends SqlShuttle { public HiveSqlNodeToCoralSqlNodeConverter(SqlValidator sqlValidator, SqlNode topSqlNode) { TypeDerivationUtil typeDerivationUtil = new TypeDerivationUtil(sqlValidator, topSqlNode); - operatorTransformerList = SqlCallTransformers.of(new ShiftArrayIndexTransformer(typeDerivationUtil), - new SingleUnionFieldReferenceTransformer(typeDerivationUtil)); + operatorTransformerList = SqlCallTransformers.of(new ShiftArrayIndexTransformer(typeDerivationUtil)); } @Override diff --git a/coral-hive/src/main/java/com/linkedin/coral/transformers/SingleUnionFieldReferenceTransformer.java b/coral-hive/src/main/java/com/linkedin/coral/transformers/SingleUnionFieldReferenceTransformer.java deleted file mode 100644 index 32c721789..000000000 --- a/coral-hive/src/main/java/com/linkedin/coral/transformers/SingleUnionFieldReferenceTransformer.java +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Copyright 2023 LinkedIn Corporation. All rights reserved. - * Licensed under the BSD-2 Clause license. - * See LICENSE in the project root for license information. - */ -package com.linkedin.coral.transformers; - -import org.apache.calcite.sql.SqlBasicCall; -import org.apache.calcite.sql.SqlCall; - -import com.linkedin.coral.common.functions.FunctionFieldReferenceOperator; -import com.linkedin.coral.common.transformers.SqlCallTransformer; -import com.linkedin.coral.common.utils.TypeDerivationUtil; - - -/** - * This transformer focuses on SqlCalls that involve a FunctionFieldReferenceOperator with the following characteristics: - * (1) The first operand is a SqlBasicCall with a non-struct RelDataType, and the second operand is tag_0. - * This indicates that the first operand represents a Union data type with a single data type inside. - * (2) Examples of such SqlCalls include extract_union(product.value).tag_0 or (extract_union(product.value).id).tag_0. - * (3) The transformation for such SqlCalls is to return the first operand. - */ -public class SingleUnionFieldReferenceTransformer extends SqlCallTransformer { - private static final String TAG_0_OPERAND = "tag_0"; - - public SingleUnionFieldReferenceTransformer(TypeDerivationUtil typeDerivationUtil) { - super(typeDerivationUtil); - } - - @Override - protected boolean condition(SqlCall sqlCall) { - if (FunctionFieldReferenceOperator.DOT.getName().equalsIgnoreCase(sqlCall.getOperator().getName())) { - if (sqlCall.operand(0) instanceof SqlBasicCall) { - SqlBasicCall outerSqlBasicCall = sqlCall.operand(0); - boolean isOperandStruct = deriveRelDatatype(outerSqlBasicCall).isStruct(); - - return !isOperandStruct - && FunctionFieldReferenceOperator.fieldNameStripQuotes(sqlCall.operand(1)).equalsIgnoreCase(TAG_0_OPERAND); - } - } - return false; - } - - @Override - protected SqlCall transform(SqlCall sqlCall) { - // convert x.tag_0 -> x where x is a sqlBasicCall with non-struct RelDataType - return sqlCall.operand(0); - } -} diff --git a/coral-spark/src/main/java/com/linkedin/coral/spark/CoralSpark.java b/coral-spark/src/main/java/com/linkedin/coral/spark/CoralSpark.java index 8ffba3f71..9ef1a23f2 100644 --- a/coral-spark/src/main/java/com/linkedin/coral/spark/CoralSpark.java +++ b/coral-spark/src/main/java/com/linkedin/coral/spark/CoralSpark.java @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2018-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -13,6 +13,7 @@ import org.apache.calcite.plan.RelOptTable; import org.apache.calcite.plan.RelOptUtil; import org.apache.calcite.rel.RelNode; +import org.apache.calcite.sql.SqlIdentifier; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.SqlNode; import org.apache.calcite.sql.SqlSelect; @@ -76,7 +77,7 @@ public static CoralSpark create(RelNode irRelNode, HiveMetastoreClient hmsClient SparkRelInfo sparkRelInfo = IRRelToSparkRelTransformer.transform(irRelNode); Set sparkUDFInfos = sparkRelInfo.getSparkUDFInfos(); RelNode sparkRelNode = sparkRelInfo.getSparkRelNode(); - SqlNode sparkSqlNode = constructSparkSqlNode(sparkRelNode, sparkUDFInfos); + SqlNode sparkSqlNode = constructSparkSqlNode(sparkRelNode, sparkUDFInfos, hmsClient); String sparkSQL = constructSparkSQL(sparkSqlNode); List baseTables = constructBaseTables(sparkRelNode); return new CoralSpark(baseTables, ImmutableList.copyOf(sparkUDFInfos), sparkSQL, hmsClient, sparkSqlNode); @@ -101,11 +102,11 @@ private static CoralSpark createWithAlias(RelNode irRelNode, List aliase SparkRelInfo sparkRelInfo = IRRelToSparkRelTransformer.transform(irRelNode); Set sparkUDFInfos = sparkRelInfo.getSparkUDFInfos(); RelNode sparkRelNode = sparkRelInfo.getSparkRelNode(); - SqlNode sparkSqlNode = constructSparkSqlNode(sparkRelNode, sparkUDFInfos); + SqlNode sparkSqlNode = constructSparkSqlNode(sparkRelNode, sparkUDFInfos, hmsClient); // Use a second pass visit to add explicit alias names, // only do this when it's not a select star case, // since for select star we don't need to add any explicit aliases - if (sparkSqlNode.getKind() == SqlKind.SELECT && ((SqlSelect) sparkSqlNode).getSelectList() != null) { + if (sparkSqlNode.getKind() == SqlKind.SELECT && !isSelectStar(sparkSqlNode)) { sparkSqlNode = sparkSqlNode.accept(new AddExplicitAlias(aliases)); } String sparkSQL = constructSparkSQL(sparkSqlNode); @@ -113,11 +114,16 @@ private static CoralSpark createWithAlias(RelNode irRelNode, List aliase return new CoralSpark(baseTables, ImmutableList.copyOf(sparkUDFInfos), sparkSQL, hmsClient, sparkSqlNode); } - private static SqlNode constructSparkSqlNode(RelNode sparkRelNode, Set sparkUDFInfos) { + private static SqlNode constructSparkSqlNode(RelNode sparkRelNode, Set sparkUDFInfos, + HiveMetastoreClient hmsClient) { CoralRelToSqlNodeConverter rel2sql = new CoralRelToSqlNodeConverter(); SqlNode coralSqlNode = rel2sql.convert(sparkRelNode); - SqlNode sparkSqlNode = coralSqlNode.accept(new CoralSqlNodeToSparkSqlNodeConverter()) - .accept(new CoralToSparkSqlCallConverter(sparkUDFInfos)); + + SqlNode coralSqlNodeWithRelDataTypeDerivedConversions = + coralSqlNode.accept(new DataTypeDerivedSqlCallConverter(hmsClient, coralSqlNode, sparkUDFInfos)); + + SqlNode sparkSqlNode = coralSqlNodeWithRelDataTypeDerivedConversions + .accept(new CoralSqlNodeToSparkSqlNodeConverter()).accept(new CoralToSparkSqlCallConverter(sparkUDFInfos)); return sparkSqlNode.accept(new SparkSqlRewriter()); } @@ -125,6 +131,15 @@ public static String constructSparkSQL(SqlNode sparkSqlNode) { return sparkSqlNode.toSqlString(SparkSqlDialect.INSTANCE).getSql(); } + private static boolean isSelectStar(SqlNode node) { + if (node.getKind() == SqlKind.SELECT && ((SqlSelect) node).getSelectList().size() == 1 + && ((SqlSelect) node).getSelectList().get(0) instanceof SqlIdentifier) { + SqlIdentifier identifier = (SqlIdentifier) ((SqlSelect) node).getSelectList().get(0); + return identifier.isStar(); + } + return false; + } + /** * This function returns the list of base table names, in the format * "database_name.table_name". diff --git a/coral-spark/src/main/java/com/linkedin/coral/spark/CoralToSparkSqlCallConverter.java b/coral-spark/src/main/java/com/linkedin/coral/spark/CoralToSparkSqlCallConverter.java index c8a86d35c..5170507c8 100644 --- a/coral-spark/src/main/java/com/linkedin/coral/spark/CoralToSparkSqlCallConverter.java +++ b/coral-spark/src/main/java/com/linkedin/coral/spark/CoralToSparkSqlCallConverter.java @@ -1,5 +1,5 @@ /** - * Copyright 2023 LinkedIn Corporation. All rights reserved. + * Copyright 2023-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -15,7 +15,6 @@ import com.linkedin.coral.common.transformers.OperatorRenameSqlCallTransformer; import com.linkedin.coral.common.transformers.SqlCallTransformers; import com.linkedin.coral.spark.containers.SparkUDFInfo; -import com.linkedin.coral.spark.transformers.ExtractUnionFunctionTransformer; import com.linkedin.coral.spark.transformers.FallBackToLinkedInHiveUDFTransformer; import com.linkedin.coral.spark.transformers.FuzzyUnionGenericProjectTransformer; import com.linkedin.coral.spark.transformers.TransportUDFTransformer; @@ -157,9 +156,6 @@ public CoralToSparkSqlCallConverter(Set sparkUDFInfos) { // Fall back to the original Hive UDF defined in StaticHiveFunctionRegistry after failing to apply transformers above new FallBackToLinkedInHiveUDFTransformer(sparkUDFInfos), - // Transform `extract_union` to `coalesce_struct` - new ExtractUnionFunctionTransformer(sparkUDFInfos), - // Transform `generic_project` function new FuzzyUnionGenericProjectTransformer(sparkUDFInfos)); } diff --git a/coral-spark/src/main/java/com/linkedin/coral/spark/DataTypeDerivedSqlCallConverter.java b/coral-spark/src/main/java/com/linkedin/coral/spark/DataTypeDerivedSqlCallConverter.java new file mode 100644 index 000000000..079031a7f --- /dev/null +++ b/coral-spark/src/main/java/com/linkedin/coral/spark/DataTypeDerivedSqlCallConverter.java @@ -0,0 +1,47 @@ +/** + * Copyright 2022-2024 LinkedIn Corporation. All rights reserved. + * Licensed under the BSD-2 Clause license. + * See LICENSE in the project root for license information. + */ +package com.linkedin.coral.spark; + +import java.util.Set; + +import org.apache.calcite.sql.SqlCall; +import org.apache.calcite.sql.SqlNode; +import org.apache.calcite.sql.util.SqlShuttle; + +import com.linkedin.coral.common.HiveMetastoreClient; +import com.linkedin.coral.common.transformers.SqlCallTransformers; +import com.linkedin.coral.common.utils.TypeDerivationUtil; +import com.linkedin.coral.hive.hive2rel.HiveToRelConverter; +import com.linkedin.coral.spark.containers.SparkUDFInfo; +import com.linkedin.coral.spark.transformers.ExtractUnionFunctionTransformer; + + +/** + * DataTypeDerivedSqlCallConverter transforms the sqlCalls + * in the input SqlNode representation to be compatible with Spark engine. + * The transformation may involve change in operator, reordering the operands + * or even re-constructing the SqlNode. + * + * All the transformations performed as part of this shuttle require RelDataType derivation. + */ +public class DataTypeDerivedSqlCallConverter extends SqlShuttle { + private final SqlCallTransformers operatorTransformerList; + private final HiveToRelConverter toRelConverter; + TypeDerivationUtil typeDerivationUtil; + + public DataTypeDerivedSqlCallConverter(HiveMetastoreClient mscClient, SqlNode topSqlNode, + Set sparkUDFInfos) { + toRelConverter = new HiveToRelConverter(mscClient); + typeDerivationUtil = new TypeDerivationUtil(toRelConverter.getSqlValidator(), topSqlNode); + operatorTransformerList = + SqlCallTransformers.of(new ExtractUnionFunctionTransformer(typeDerivationUtil, sparkUDFInfos)); + } + + @Override + public SqlNode visit(SqlCall call) { + return operatorTransformerList.apply((SqlCall) super.visit(call)); + } +} diff --git a/coral-spark/src/main/java/com/linkedin/coral/spark/transformers/ExtractUnionFunctionTransformer.java b/coral-spark/src/main/java/com/linkedin/coral/spark/transformers/ExtractUnionFunctionTransformer.java index 27d6884b1..9572bcbbd 100644 --- a/coral-spark/src/main/java/com/linkedin/coral/spark/transformers/ExtractUnionFunctionTransformer.java +++ b/coral-spark/src/main/java/com/linkedin/coral/spark/transformers/ExtractUnionFunctionTransformer.java @@ -1,22 +1,31 @@ /** - * Copyright 2023 LinkedIn Corporation. All rights reserved. + * Copyright 2023-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ package com.linkedin.coral.spark.transformers; import java.net.URI; +import java.util.ArrayList; import java.util.List; import java.util.Set; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rel.type.RelDataTypeFactory; +import org.apache.calcite.rel.type.RelDataTypeField; import org.apache.calcite.sql.SqlCall; +import org.apache.calcite.sql.SqlLiteral; import org.apache.calcite.sql.SqlNode; import org.apache.calcite.sql.SqlNumericLiteral; import org.apache.calcite.sql.SqlOperator; import org.apache.calcite.sql.parser.SqlParserPos; +import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; import com.linkedin.coral.com.google.common.collect.ImmutableList; +import com.linkedin.coral.common.TypeConverter; import com.linkedin.coral.common.transformers.SqlCallTransformer; +import com.linkedin.coral.common.utils.RelDataTypeToHiveTypeStringConverter; +import com.linkedin.coral.common.utils.TypeDerivationUtil; import com.linkedin.coral.hive.hive2rel.functions.CoalesceStructUtility; import com.linkedin.coral.spark.containers.SparkUDFInfo; @@ -30,14 +39,34 @@ * See {@link CoalesceStructUtility#COALESCE_STRUCT_FUNCTION_RETURN_STRATEGY} and its comments for more details. * * Check `CoralSparkTest#testUnionExtractUDF` for examples. + * + * Note that there is a Spark-specific mechanism that unwraps a single uniontype (a uniontype holding only one data type) + * to simply the single underlying data type. This behavior is specific during the Avro schema to Spark schema conversion + * in base tables. The problem with this behavior is we expect `coalesce_struct` to coalesce columns that originally contained + * single uniontypes, yet lose this information after Spark gets rid of the uniontype. To work around this, we retain information + * about the original schema and pass it to `coalesce_struct` UDF as a schema string. + * Reference: https://spark.apache.org/docs/latest/sql-data-sources-avro.html#supported-types-for-avro---spark-sql-conversion + * + * For example, if we have an input SqlNode like so, where `col` is a uniontype column holding only string type: + * "SELECT extract_union(col) FROM table" + * + * This transformer would transform the above SqlNode to: + * "SELECT coalesce_struct(col, 'uniontype<string>') FROM table" + * + * Check `CoralSparkTest#testUnionExtractUDFOnSingleTypeUnions` for more examples including examples where we have single + * uniontypes nested in a struct. + * */ public class ExtractUnionFunctionTransformer extends SqlCallTransformer { private static final String EXTRACT_UNION = "extract_union"; private static final String COALESCE_STRUCT = "coalesce_struct"; private final Set sparkUDFInfos; + private static final RelDataTypeToHiveTypeStringConverter hiveTypeStringConverter = + new RelDataTypeToHiveTypeStringConverter(true); - public ExtractUnionFunctionTransformer(Set sparkUDFInfos) { + public ExtractUnionFunctionTransformer(TypeDerivationUtil typeDerivationUtil, Set sparkUDFInfos) { + super(typeDerivationUtil); this.sparkUDFInfos = sparkUDFInfos; } @@ -56,6 +85,18 @@ protected SqlCall transform(SqlCall sqlCall) { createSqlOperator(COALESCE_STRUCT, CoalesceStructUtility.COALESCE_STRUCT_FUNCTION_RETURN_STRATEGY); if (operandList.size() == 1) { // one arg case: extract_union(field_name) + RelDataType operandType = deriveRelDatatype(sqlCall.operand(0)); + + if (containsSingleUnionType(operandType)) { + // Pass in schema string to keep track of the original Hive schema containing single uniontypes so coalesce_struct + // UDF knows which fields are unwrapped single uniontypes. This is needed otherwise coalesce_struct would + // not coalesce the single uniontype fields as expected. + String operandSchemaString = hiveTypeStringConverter.convertRelDataType(deriveRelDatatype(sqlCall.operand(0))); + List newOperandList = new ArrayList<>(operandList); + newOperandList.add(SqlLiteral.createCharString(operandSchemaString, SqlParserPos.ZERO)); + return coalesceStructFunction.createCall(sqlCall.getParserPosition(), newOperandList); + } + return coalesceStructFunction.createCall(sqlCall.getParserPosition(), operandList); } else if (operandList.size() == 2) { // two arg case: extract_union(field_name, ordinal) @@ -66,4 +107,41 @@ protected SqlCall transform(SqlCall sqlCall) { return sqlCall; } } + + private boolean containsSingleUnionType(RelDataType relDataType) { + if (isSingleUnionType(relDataType)) { + return true; + } + + // Recursive case: if the current type is a struct, map or collection, check its nested types + if (relDataType.isStruct()) { + for (RelDataTypeField field : relDataType.getFieldList()) { + if (containsSingleUnionType(field.getType())) { + return true; + } + } + } else if (relDataType.getKeyType() != null) { + // For map type, check both key and value types + if (containsSingleUnionType(relDataType.getKeyType()) || containsSingleUnionType(relDataType.getValueType())) { + return true; + } + } else if (relDataType.getComponentType() != null) { + // For collection type, check the component type + if (containsSingleUnionType(relDataType.getComponentType())) { + return true; + } + } + + return false; + } + + /** + * Check if the given RelDataType is a single union type in Coral IR representation, the conversion to which happens in + * {@link TypeConverter#convert(UnionTypeInfo, RelDataTypeFactory)} + */ + private boolean isSingleUnionType(RelDataType relDataType) { + return relDataType.isStruct() && relDataType.getFieldList().size() == 2 + && relDataType.getFieldList().get(0).getKey().equalsIgnoreCase("tag") + && relDataType.getFieldList().get(1).getKey().equalsIgnoreCase("field0"); + } } diff --git a/coral-spark/src/test/java/com/linkedin/coral/spark/CoralSparkTest.java b/coral-spark/src/test/java/com/linkedin/coral/spark/CoralSparkTest.java index 666ea87e3..454ea0ac7 100644 --- a/coral-spark/src/test/java/com/linkedin/coral/spark/CoralSparkTest.java +++ b/coral-spark/src/test/java/com/linkedin/coral/spark/CoralSparkTest.java @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2018-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -421,6 +421,30 @@ public void testUnionExtractUDF() { assertEquals(createCoralSpark(relNode2).getSparkSql(), targetSql2); } + @Test + public void testUnionExtractUDFOnSingleTypeUnions() { + RelNode relNode = TestUtils.toRelNode("SELECT extract_union(bar) from union_table"); + String targetSql = "SELECT coalesce_struct(union_table.bar, 'uniontype>')\n" + + "FROM default.union_table union_table"; + assertEquals(createCoralSpark(relNode).getSparkSql(), targetSql); + + RelNode relNode1 = TestUtils.toRelNode("SELECT extract_union(baz) from union_table"); + String targetSql1 = "SELECT coalesce_struct(union_table.baz, 'struct>>')\n" + + "FROM default.union_table union_table"; + assertEquals(createCoralSpark(relNode1).getSparkSql(), targetSql1); + + RelNode relNode2 = TestUtils.toRelNode("SELECT extract_union(bar).tag_0 from union_table"); + String targetSql2 = "SELECT coalesce_struct(union_table.bar, 'uniontype>').tag_0\n" + + "FROM default.union_table union_table"; + assertEquals(createCoralSpark(relNode2).getSparkSql(), targetSql2); + + RelNode relNode3 = TestUtils.toRelNode("SELECT extract_union(baz).single.tag_0 from union_table"); + String targetSql4 = + "SELECT (coalesce_struct(union_table.baz, 'struct>>').single).tag_0\n" + + "FROM default.union_table union_table"; + assertEquals(createCoralSpark(relNode3).getSparkSql(), targetSql4); + } + @Test public void testDateFunction() { RelNode relNode = TestUtils.toRelNode("SELECT date('2021-01-02') as a FROM foo"); diff --git a/coral-spark/src/test/java/com/linkedin/coral/spark/TestUtils.java b/coral-spark/src/test/java/com/linkedin/coral/spark/TestUtils.java index 845e4ba39..7f4ae0617 100644 --- a/coral-spark/src/test/java/com/linkedin/coral/spark/TestUtils.java +++ b/coral-spark/src/test/java/com/linkedin/coral/spark/TestUtils.java @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2018-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -223,7 +223,7 @@ public static void initializeViews(HiveConf conf) throws HiveException, MetaExce run(driver, String.join("\n", "", "ALTER TABLE schema_promotion CHANGE COLUMN b b array")); run(driver, - "CREATE TABLE IF NOT EXISTS union_table(foo uniontype, struct>)"); + "CREATE TABLE IF NOT EXISTS union_table(foo uniontype, struct>, bar uniontype>, baz struct>>)"); run(driver, "CREATE TABLE IF NOT EXISTS nested_union(a uniontype, b:int>>)"); From 8e03f4cc74f7ede855ee82f99af3d69b0fdab8a9 Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Wed, 31 Jul 2024 17:12:00 -0400 Subject: [PATCH 08/12] [Coral-Trino] Use CoralSqlDialect during CoralRelNode to CoralSqlNode conversion (#521) * temp * temp * use coralsqldialect in trino reltosql * Import class not member --- .../linkedin/coral/trino/rel2trino/RelToTrinoConverter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/coral-trino/src/main/java/com/linkedin/coral/trino/rel2trino/RelToTrinoConverter.java b/coral-trino/src/main/java/com/linkedin/coral/trino/rel2trino/RelToTrinoConverter.java index 9ebc2cf93..a2b3c6145 100644 --- a/coral-trino/src/main/java/com/linkedin/coral/trino/rel2trino/RelToTrinoConverter.java +++ b/coral-trino/src/main/java/com/linkedin/coral/trino/rel2trino/RelToTrinoConverter.java @@ -32,6 +32,7 @@ import com.linkedin.coral.common.HiveMetastoreClient; import com.linkedin.coral.common.functions.CoralSqlUnnestOperator; import com.linkedin.coral.common.functions.FunctionFieldReferenceOperator; +import com.linkedin.coral.transformers.CoralRelToSqlNodeConverter; import static com.google.common.base.Preconditions.*; import static com.linkedin.coral.trino.rel2trino.CoralTrinoConfigKeys.*; @@ -60,7 +61,7 @@ public class RelToTrinoConverter extends RelToSqlConverter { * @param mscClient client interface used to interact with the Hive Metastore service. */ public RelToTrinoConverter(HiveMetastoreClient mscClient) { - super(TrinoSqlDialect.INSTANCE); + super(CoralRelToSqlNodeConverter.INSTANCE); _hiveMetastoreClient = mscClient; } @@ -70,7 +71,7 @@ public RelToTrinoConverter(HiveMetastoreClient mscClient) { * @param configs configs */ public RelToTrinoConverter(HiveMetastoreClient mscClient, Map configs) { - super(TrinoSqlDialect.INSTANCE); + super(CoralRelToSqlNodeConverter.INSTANCE); checkNotNull(configs); this.configs = configs; _hiveMetastoreClient = mscClient; From 8786af3d534d3dc575229092d7774f2d7d2a969b Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Wed, 31 Jul 2024 18:26:17 -0400 Subject: [PATCH 09/12] [Coral-Schema] Fix incorrect type derivation for repeated field reference on UDF calls (#510) * fix field reference bug in coral schema * bring back override tag * address nits --- .../linkedin/coral/common/TypeConverter.java | 1 - .../schema/avro/RelToAvroSchemaConverter.java | 84 +++++++++++-------- .../linkedin/coral/schema/avro/TestUtils.java | 3 + .../avro/ViewToAvroSchemaConverterTests.java | 13 ++- ...estSingleUnionFieldReference-expected.avsc | 12 +++ 5 files changed, 74 insertions(+), 39 deletions(-) create mode 100644 coral-schema/src/test/resources/testSingleUnionFieldReference-expected.avsc diff --git a/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java b/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java index 022538eaa..d1dc558af 100644 --- a/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java +++ b/coral-common/src/main/java/com/linkedin/coral/common/TypeConverter.java @@ -140,7 +140,6 @@ public static RelDataType convert(StructTypeInfo structType, final RelDataTypeFa // The schema of output Struct conforms to https://github.com/trinodb/trino/pull/3483 // except we adopted "integer" for the type of "tag" field instead of "tinyint" in the Trino patch // for compatibility with other platforms that Iceberg currently doesn't support tinyint type. - // When the field count inside UnionTypeInfo is one, we surface the underlying RelDataType instead. // Note: this is subject to change in the future pending on the discussion in // https://mail-archives.apache.org/mod_mbox/iceberg-dev/202112.mbox/browser diff --git a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java index 01109a372..a7165a779 100644 --- a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java +++ b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2019-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -495,48 +495,58 @@ public RexNode visitRangeRef(RexRangeRef rexRangeRef) { public RexNode visitFieldAccess(RexFieldAccess rexFieldAccess) { RexNode referenceExpr = rexFieldAccess.getReferenceExpr(); - if (referenceExpr instanceof RexCall - && ((RexCall) referenceExpr).getOperator() instanceof SqlUserDefinedFunction) { - String oldFieldName = rexFieldAccess.getField().getName(); - String suggestNewFieldName = suggestedFieldNames.poll(); - String newFieldName = SchemaUtilities.getFieldName(oldFieldName, suggestNewFieldName); - - RelDataType fieldType = rexFieldAccess.getType(); - boolean isNullable = SchemaUtilities.isFieldNullable((RexCall) referenceExpr, inputSchema); - // TODO: add field documentation - SchemaUtilities.appendField(newFieldName, fieldType, null, fieldAssembler, isNullable); - } else { - Deque innerRecordNames = new LinkedList<>(); - while (!(referenceExpr instanceof RexInputRef)) { - if (referenceExpr instanceof RexCall - && ((RexCall) referenceExpr).getOperator().getName().equalsIgnoreCase("ITEM")) { - // While selecting `int_field` from `array_col:array>` using `array_col[x].int_field`, - // `rexFieldAccess` is like `ITEM($1, 1).int_field`, we need to set `referenceExpr` to be the first operand (`$1`) of `ITEM` function - referenceExpr = ((RexCall) referenceExpr).getOperands().get(0); - } else if (referenceExpr instanceof RexFieldAccess) { - // While selecting `int_field` from `struct_col:struct>` using `struct_col.inner_struct_col.int_field`, - // `rexFieldAccess` is like `$3.inner_struct_col.int_field`, we need to set `referenceExpr` to be the expr (`$3`) of itself. - // Besides, we need to store the field name (`inner_struct_col`) in `fieldNames` so that we can retrieve the correct inner struct from `topSchema` afterwards - innerRecordNames.push(((RexFieldAccess) referenceExpr).getField().getName()); - referenceExpr = ((RexFieldAccess) referenceExpr).getReferenceExpr(); - } else { - return super.visitFieldAccess(rexFieldAccess); - } + Deque innerRecordNames = new LinkedList<>(); + while (!(referenceExpr instanceof RexInputRef)) { + if (referenceExpr instanceof RexCall + && ((RexCall) referenceExpr).getOperator().getName().equalsIgnoreCase("ITEM")) { + // While selecting `int_field` from `array_col:array>` using `array_col[x].int_field`, + // `rexFieldAccess` is like `ITEM($1, 1).int_field`, we need to set `referenceExpr` to be the first operand (`$1`) of `ITEM` function + referenceExpr = ((RexCall) referenceExpr).getOperands().get(0); + } else if (referenceExpr instanceof RexCall + && ((RexCall) referenceExpr).getOperator() instanceof SqlUserDefinedFunction) { + // UDFs calls could potentially be doubly (or more) field-referenced, for example, `extract_union(baz).single.tag_0` + // where baz is a struct containing a uniontype field. In this case, we simply need to use derived type of the entire + // call. Note that this also takes care of the simple one layer field reference on a UDF call. + handleUDFFieldAccess(rexFieldAccess, (RexCall) referenceExpr); + return rexFieldAccess; + } else if (referenceExpr instanceof RexFieldAccess) { + // While selecting `int_field` from `struct_col:struct>` using `struct_col.inner_struct_col.int_field`, + // `rexFieldAccess` is like `$3.inner_struct_col.int_field`, we need to set `referenceExpr` to be the expr (`$3`) of itself. + // Besides, we need to store the field name (`inner_struct_col`) in `fieldNames` so that we can retrieve the correct inner struct from `topSchema` afterwards + innerRecordNames.push(((RexFieldAccess) referenceExpr).getField().getName()); + referenceExpr = ((RexFieldAccess) referenceExpr).getReferenceExpr(); + } else { + return super.visitFieldAccess(rexFieldAccess); } - - String oldFieldName = rexFieldAccess.getField().getName(); - String suggestNewFieldName = suggestedFieldNames.poll(); - String newFieldName = SchemaUtilities.getFieldName(oldFieldName, suggestNewFieldName); - Schema topSchema = inputSchema.getFields().get(((RexInputRef) referenceExpr).getIndex()).schema(); - - Schema.Field accessedField = getFieldFromTopSchema(topSchema, oldFieldName, innerRecordNames); - assert accessedField != null; - SchemaUtilities.appendField(newFieldName, accessedField, fieldAssembler); } + handleFieldAccess(rexFieldAccess, (RexInputRef) referenceExpr, innerRecordNames); return rexFieldAccess; } + private void handleFieldAccess(RexFieldAccess rexFieldAccess, RexInputRef referenceExpr, + Deque innerRecordNames) { + String oldFieldName = rexFieldAccess.getField().getName(); + String suggestNewFieldName = suggestedFieldNames.poll(); + String newFieldName = SchemaUtilities.getFieldName(oldFieldName, suggestNewFieldName); + + Schema topSchema = inputSchema.getFields().get(referenceExpr.getIndex()).schema(); + Schema.Field accessedField = getFieldFromTopSchema(topSchema, oldFieldName, innerRecordNames); + assert accessedField != null; + SchemaUtilities.appendField(newFieldName, accessedField, fieldAssembler); + } + + private void handleUDFFieldAccess(RexFieldAccess rexFieldAccess, RexCall referenceExpr) { + String oldFieldName = rexFieldAccess.getField().getName(); + String suggestNewFieldName = suggestedFieldNames.poll(); + String newFieldName = SchemaUtilities.getFieldName(oldFieldName, suggestNewFieldName); + + RelDataType fieldType = rexFieldAccess.getType(); + boolean isNullable = SchemaUtilities.isFieldNullable(referenceExpr, inputSchema); + // TODO: add field documentation + SchemaUtilities.appendField(newFieldName, fieldType, null, fieldAssembler, isNullable); + } + @Override public RexNode visitSubQuery(RexSubQuery rexSubQuery) { // TODO: implement this method diff --git a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java index 56642a828..1b0f2bc3a 100644 --- a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java +++ b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java @@ -144,6 +144,9 @@ private static void initializeTables() { executeQuery("DROP TABLE IF EXISTS basedecimal"); executeQuery("CREATE TABLE IF NOT EXISTS basedecimal(decimal_col decimal(2,1))"); + + executeQuery( + "CREATE TABLE IF NOT EXISTS single_uniontypes(single uniontype, struct_col struct>)"); } private static void initializeUdfs() { diff --git a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java index 4bf8130c5..d13046229 100644 --- a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java +++ b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2019-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -560,6 +560,17 @@ public void testNullabliltyExtractUnionUDF() { Assert.assertEquals(actual.toString(true), TestUtils.loadSchema("testNullabilityExtractUnionUDF-expected.avsc")); } + @Test + public void testSingleUnionFieldReference() { + String sql = + "select extract_union(struct_col).single.tag_0 as single_in_struct, extract_union(single).tag_0 as single from single_uniontypes"; + ViewToAvroSchemaConverter viewToAvroSchemaConverter = ViewToAvroSchemaConverter.create(hiveMetastoreClient); + + Schema actual = viewToAvroSchemaConverter.toAvroSchema(sql); + + Assert.assertEquals(actual.toString(true), TestUtils.loadSchema("testSingleUnionFieldReference-expected.avsc")); + } + @Test(enabled = false) public void testRenameToLowercase() { String viewSql = "CREATE VIEW v AS " + "SELECT bc.Id AS id, bc.Array_Col AS array_col " + "FROM basecomplex bc " diff --git a/coral-schema/src/test/resources/testSingleUnionFieldReference-expected.avsc b/coral-schema/src/test/resources/testSingleUnionFieldReference-expected.avsc new file mode 100644 index 000000000..d68d1a248 --- /dev/null +++ b/coral-schema/src/test/resources/testSingleUnionFieldReference-expected.avsc @@ -0,0 +1,12 @@ +{ + "type" : "record", + "name" : "SingleUniontypes", + "namespace" : "default.single_uniontypes", + "fields" : [ { + "name" : "single_in_struct", + "type" : [ "null", "string" ] + }, { + "name" : "single", + "type" : [ "null", "string" ] + } ] +} \ No newline at end of file From 08c7a089c917637340efa069a68f7fb8cb5c615d Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Tue, 11 Jun 2024 20:00:47 -0400 Subject: [PATCH 10/12] spotless --- .../linkedin/coral/schema/avro/RelToAvroSchemaConverter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java index a7165a779..d1816a877 100644 --- a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java +++ b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java @@ -1,5 +1,5 @@ /** - * Copyright 2019-2024 LinkedIn Corporation. All rights reserved. + * Copyright 2019-2023 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ From bc555dc0ad8af35ddd5a81b63be9eb8a88b49cde Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Thu, 11 Jul 2024 17:56:40 -0400 Subject: [PATCH 11/12] test if we need functionfieldref --- .../hive/hive2rel/CoralConvertletTable.java | 16 +++++++--------- .../hive2rel/parsetree/ParseTreeBuilder.java | 3 +-- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/CoralConvertletTable.java b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/CoralConvertletTable.java index 774e81d0f..c418ca5a9 100644 --- a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/CoralConvertletTable.java +++ b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/CoralConvertletTable.java @@ -16,8 +16,6 @@ import org.apache.calcite.sql2rel.SqlRexConvertlet; import org.apache.calcite.sql2rel.StandardConvertletTable; -import com.linkedin.coral.common.functions.FunctionFieldReferenceOperator; - /** * ConvertletTable for transformations only relevant to Coral's Intermediate Representation, not specific @@ -26,13 +24,13 @@ */ public class CoralConvertletTable extends ReflectiveConvertletTable { - @SuppressWarnings("unused") - public RexNode convertFunctionFieldReferenceOperator(SqlRexContext cx, FunctionFieldReferenceOperator op, - SqlCall call) { - RexNode funcExpr = cx.convertExpression(call.operand(0)); - String fieldName = FunctionFieldReferenceOperator.fieldNameStripQuotes(call.operand(1)); - return cx.getRexBuilder().makeFieldAccess(funcExpr, fieldName, false); - } + // @SuppressWarnings("unused") + // public RexNode convertFunctionFieldReferenceOperator(SqlRexContext cx, FunctionFieldReferenceOperator op, + // SqlCall call) { + // RexNode funcExpr = cx.convertExpression(call.operand(0)); + // String fieldName = FunctionFieldReferenceOperator.fieldNameStripQuotes(call.operand(1)); + // return cx.getRexBuilder().makeFieldAccess(funcExpr, fieldName, false); + // } /** * Override {@link StandardConvertletTable#convertCast} to avoid cast optimizations that remove the cast. diff --git a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java index 8bf1ac4a0..079442f36 100644 --- a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java +++ b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/parsetree/ParseTreeBuilder.java @@ -48,7 +48,6 @@ import com.linkedin.coral.com.google.common.collect.Iterables; import com.linkedin.coral.common.functions.CoralSqlUnnestOperator; import com.linkedin.coral.common.functions.Function; -import com.linkedin.coral.common.functions.FunctionFieldReferenceOperator; import com.linkedin.coral.hive.hive2rel.functions.HiveFunctionResolver; import com.linkedin.coral.hive.hive2rel.functions.HiveJsonTupleOperator; import com.linkedin.coral.hive.hive2rel.functions.HiveRLikeOperator; @@ -523,7 +522,7 @@ protected SqlNode visitDotOperator(ASTNode node, ParseContext ctx) { Iterable names = Iterables.concat(left.names, right.names); return new SqlIdentifier(ImmutableList.copyOf(names), ZERO); } else { - return FunctionFieldReferenceOperator.DOT.createCall(ZERO, sqlNodes); + return SqlStdOperatorTable.DOT.createCall(ZERO, sqlNodes); } } From 7602c2e58c816b9971a1f7eae0773ee7e16b199c Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Fri, 12 Jul 2024 17:50:54 -0400 Subject: [PATCH 12/12] replace FunctionFieldReferenceOperator with SqlStdOperator DOT operator --- .../FunctionFieldReferenceOperator.java | 110 ------------------ .../coral/hive/hive2rel/HiveSqlValidator.java | 16 +-- .../CoralRelToSqlNodeConverter.java | 9 +- .../linkedin/coral/spark/CoralSparkTest.java | 10 +- 4 files changed, 15 insertions(+), 130 deletions(-) delete mode 100644 coral-common/src/main/java/com/linkedin/coral/common/functions/FunctionFieldReferenceOperator.java diff --git a/coral-common/src/main/java/com/linkedin/coral/common/functions/FunctionFieldReferenceOperator.java b/coral-common/src/main/java/com/linkedin/coral/common/functions/FunctionFieldReferenceOperator.java deleted file mode 100644 index f947da078..000000000 --- a/coral-common/src/main/java/com/linkedin/coral/common/functions/FunctionFieldReferenceOperator.java +++ /dev/null @@ -1,110 +0,0 @@ -/** - * Copyright 2018-2024 LinkedIn Corporation. All rights reserved. - * Licensed under the BSD-2 Clause license. - * See LICENSE in the project root for license information. - */ -package com.linkedin.coral.common.functions; - -import com.google.common.base.Preconditions; - -import org.apache.calcite.rel.type.RelDataType; -import org.apache.calcite.sql.SqlBasicCall; -import org.apache.calcite.sql.SqlBinaryOperator; -import org.apache.calcite.sql.SqlCall; -import org.apache.calcite.sql.SqlCharStringLiteral; -import org.apache.calcite.sql.SqlIdentifier; -import org.apache.calcite.sql.SqlKind; -import org.apache.calcite.sql.SqlLiteral; -import org.apache.calcite.sql.SqlNode; -import org.apache.calcite.sql.SqlWriter; -import org.apache.calcite.sql.parser.SqlParserPos; -import org.apache.calcite.sql.type.OperandTypes; -import org.apache.calcite.sql.util.SqlBasicVisitor; -import org.apache.calcite.sql.util.SqlVisitor; -import org.apache.calcite.sql.validate.SqlValidator; -import org.apache.calcite.sql.validate.SqlValidatorScope; - - -/** - * Operator to reference fields of structs returned by SQL functions. - * This supports following SQL: - * {@code - * SELECT f(col_1, col_2).field_a FROM myTable - * } - * where {@code f} is a function that returns a ROW type containing {@code field_a}. - * - * TODO: Fix calcite and fold this into Calcite DOT operator - * - */ -public class FunctionFieldReferenceOperator extends SqlBinaryOperator { - public static final FunctionFieldReferenceOperator DOT = new FunctionFieldReferenceOperator(); - - public FunctionFieldReferenceOperator() { - super(".", SqlKind.DOT, 80, true, null, null, OperandTypes.ANY_ANY); - } - - @Override - public SqlCall createCall(SqlLiteral functionQualifier, SqlParserPos pos, SqlNode... operands) { - Preconditions.checkState(operands.length == 2); - SqlCharStringLiteral fieldName = SqlLiteral.createCharString(fieldNameStripQuotes(operands[1]), SqlParserPos.ZERO); - return super.createCall(functionQualifier, pos, operands[0], fieldName); - } - - @Override - public void acceptCall(SqlVisitor visitor, SqlCall call, boolean onlyExpressions, - SqlBasicVisitor.ArgHandler argHandler) { - argHandler.visitChild(visitor, call, 0, call.operand(0)); - } - - @Override - public void unparse(SqlWriter writer, SqlCall call, int leftPrec, int rightPrec) { - call.operand(0).unparse(writer, getLeftPrec(), getRightPrec()); - writer.literal("."); - writer.setNeedWhitespace(false); - // strip quotes from fieldName - String fieldName = fieldNameStripQuotes(call.operand(1)); - writer.identifier(fieldName, true); - } - - @Override - public RelDataType deriveType(SqlValidator validator, SqlValidatorScope scope, SqlCall call) { - SqlNode firstOperand = call.operand(0); - if (firstOperand instanceof SqlBasicCall) { - RelDataType funcType = validator.deriveType(scope, firstOperand); - if (funcType.isStruct()) { - return funcType.getField(fieldNameStripQuotes(call.operand(1)), false, false).getType(); - } - } - return super.deriveType(validator, scope, call); - } - - @Override - public void validateCall(SqlCall call, SqlValidator validator, SqlValidatorScope scope, - SqlValidatorScope operandScope) { - call.operand(0).validateExpr(validator, operandScope); - } - - public static String fieldNameStripQuotes(SqlNode node) { - return stripQuotes(fieldName(node)); - } - - public static String fieldName(SqlNode node) { - switch (node.getKind()) { - case IDENTIFIER: - return ((SqlIdentifier) node).getSimple(); - case LITERAL: - return ((SqlLiteral) node).toValue(); - default: - throw new IllegalStateException( - String.format("Unknown operand type %s to reference a field, operand: %s", node.getKind(), node)); - } - } - - private static String stripQuotes(String id) { - if ((id.startsWith("'") && id.endsWith("'")) || (id.startsWith("\"") && id.endsWith("\""))) { - return id.substring(1, id.length() - 1); - } - return id; - } - -} diff --git a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveSqlValidator.java b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveSqlValidator.java index 66ce83bb5..3736c268c 100644 --- a/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveSqlValidator.java +++ b/coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/HiveSqlValidator.java @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2017-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -9,7 +9,6 @@ import org.apache.calcite.config.NullCollation; import org.apache.calcite.prepare.CalciteCatalogReader; import org.apache.calcite.rel.type.RelDataType; -import org.apache.calcite.sql.SqlBasicCall; import org.apache.calcite.sql.SqlInsert; import org.apache.calcite.sql.SqlNode; import org.apache.calcite.sql.SqlOperatorTable; @@ -19,8 +18,6 @@ import org.apache.calcite.sql.validate.SqlValidatorImpl; import org.apache.calcite.sql.validate.SqlValidatorScope; -import com.linkedin.coral.common.functions.FunctionFieldReferenceOperator; - public class HiveSqlValidator extends SqlValidatorImpl { @@ -51,15 +48,4 @@ protected void inferUnknownTypes(RelDataType inferredType, SqlValidatorScope sco super.inferUnknownTypes(inferredType, scope, node); } - @Override - public SqlNode expand(SqlNode expr, SqlValidatorScope scope) { - if (expr instanceof SqlBasicCall - && ((SqlBasicCall) expr).getOperator().equals(FunctionFieldReferenceOperator.DOT)) { - SqlBasicCall dotCall = (SqlBasicCall) expr; - if (dotCall.operand(0) instanceof SqlBasicCall) { - return expr; - } - } - return super.expand(expr, scope); - } } diff --git a/coral-hive/src/main/java/com/linkedin/coral/transformers/CoralRelToSqlNodeConverter.java b/coral-hive/src/main/java/com/linkedin/coral/transformers/CoralRelToSqlNodeConverter.java index 111bfcaa7..456f929b0 100644 --- a/coral-hive/src/main/java/com/linkedin/coral/transformers/CoralRelToSqlNodeConverter.java +++ b/coral-hive/src/main/java/com/linkedin/coral/transformers/CoralRelToSqlNodeConverter.java @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 LinkedIn Corporation. All rights reserved. + * Copyright 2017-2024 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -49,7 +49,6 @@ import com.linkedin.coral.com.google.common.collect.ImmutableList; import com.linkedin.coral.com.google.common.collect.ImmutableMap; import com.linkedin.coral.common.functions.CoralSqlUnnestOperator; -import com.linkedin.coral.common.functions.FunctionFieldReferenceOperator; /** @@ -474,7 +473,7 @@ private SqlNode generateRightChildForSqlJoinWithLateralViews(BiRel e, Result rig * Calcite converts it to a {@link SqlIdentifier} with {@link SqlIdentifier#names} as ["f(x)", "y"] where "f(x)" and "y" are String, * which is opaque and not aligned with our expectation, since we want to apply transformations on `f(x)` with * {@link com.linkedin.coral.common.transformers.SqlCallTransformer}. Therefore, we override this - * method to convert `f(x)` to {@link SqlCall} and `.` to {@link com.linkedin.coral.common.functions.FunctionFieldReferenceOperator#DOT}. + * method to convert `f(x)` to {@link SqlCall} and `.` to {@link SqlStdOperatorTable#DOT}. * * With this override, the converted CoralSqlNode matches the previous SqlNode handed over to Calcite for validation and conversion * in `HiveSqlToRelConverter#convertQuery`. @@ -500,7 +499,9 @@ public SqlNode toSql(RexProgram program, RexNode rex) { SqlNode functionCall = toSql(program, referencedExpr); Collections.reverse(accessNames); for (String accessName : accessNames) { - functionCall = FunctionFieldReferenceOperator.DOT.createCall(SqlParserPos.ZERO, functionCall, + // functionCall = FunctionFieldReferenceOperator.DOT.createCall(SqlParserPos.ZERO, functionCall, + // new SqlIdentifier(accessName, POS)); + functionCall = SqlStdOperatorTable.DOT.createCall(SqlParserPos.ZERO, functionCall, new SqlIdentifier(accessName, POS)); } return functionCall; diff --git a/coral-spark/src/test/java/com/linkedin/coral/spark/CoralSparkTest.java b/coral-spark/src/test/java/com/linkedin/coral/spark/CoralSparkTest.java index 454ea0ac7..ddc7158e3 100644 --- a/coral-spark/src/test/java/com/linkedin/coral/spark/CoralSparkTest.java +++ b/coral-spark/src/test/java/com/linkedin/coral/spark/CoralSparkTest.java @@ -876,9 +876,17 @@ public void testSelectStar() { @Test public void testConvertFieldAccessOnFunctionCall() { + RelNode relNode = TestUtils.toRelNode("SELECT named_struct('a', 1).a"); + + String targetSql = "SELECT named_struct('a', 1).a\n" + "FROM (VALUES (0)) t (ZERO)"; + assertEquals(createCoralSpark(relNode).getSparkSql(), targetSql); + } + + @Test + public void testConvertNestedFieldAccessOnFunctionCall() { RelNode relNode = TestUtils.toRelNode("SELECT named_struct('a', named_struct('b', 1)).a.b"); - String targetSql = "SELECT (named_struct('a', named_struct('b', 1)).a).b\n" + "FROM (VALUES (0)) t (ZERO)"; + String targetSql = "SELECT named_struct('a', named_struct('b', 1)).a.b\n" + "FROM (VALUES (0)) t (ZERO)"; assertEquals(createCoralSpark(relNode).getSparkSql(), targetSql); }