From 0d1adea17e61fd49a330b3fe2d5071404dc672e8 Mon Sep 17 00:00:00 2001 From: PHILO-HE Date: Wed, 7 Aug 2024 17:30:07 +0800 Subject: [PATCH 1/2] [VL] Skip UTF-8 validation in JSON parsing (#6661) --- .../execution/ScalarFunctionsValidateSuite.scala | 14 +++++++++++--- ep/build-velox/src/modify_velox.patch | 11 +++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala index 80fd72909f42..7c1033db6194 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala @@ -217,20 +217,28 @@ class ScalarFunctionsValidateSuite extends FunctionsValidateTest { } } - test("Test get_json_object datatab function") { + test("get_json_object") { runQueryAndCompare( "SELECT get_json_object(string_field1, '$.a') " + "from datatab limit 1;") { checkGlutenOperatorMatch[ProjectExecTransformer] } - } - test("Test get_json_object lineitem function") { runQueryAndCompare( "SELECT l_orderkey, get_json_object('{\"a\":\"b\"}', '$.a') " + "from lineitem limit 1;") { checkGlutenOperatorMatch[ProjectExecTransformer] } + + // Invalid UTF-8 encoding. + spark.sql( + "CREATE TABLE t USING parquet SELECT concat('{\"a\": 2, \"'," + + " string(X'80'), '\": 3, \"c\": 100}') AS c1") + withTable("t") { + runQueryAndCompare("SELECT get_json_object(c1, '$.c') FROM t;") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } } ignore("json_array_length") { diff --git a/ep/build-velox/src/modify_velox.patch b/ep/build-velox/src/modify_velox.patch index 1cb352a259a3..8e688d8c56da 100644 --- a/ep/build-velox/src/modify_velox.patch +++ b/ep/build-velox/src/modify_velox.patch @@ -177,3 +177,14 @@ index 2cabfc29a..54329ce23 100644 add_library( velox_dwio_arrow_parquet_writer_test_lib +diff --git a/CMake/resolve_dependency_modules/simdjson.cmake b/CMake/resolve_dependency_modules/simdjson.cmake +index 69e7f2044..777eb5ec1 100644 +--- a/CMake/resolve_dependency_modules/simdjson.cmake ++++ b/CMake/resolve_dependency_modules/simdjson.cmake +@@ -29,4 +29,6 @@ FetchContent_Declare( + URL ${VELOX_SIMDJSON_SOURCE_URL} + URL_HASH ${VELOX_SIMDJSON_BUILD_SHA256_CHECKSUM}) + ++set(SIMDJSON_SKIPUTF8VALIDATION ON) ++ + FetchContent_MakeAvailable(simdjson) From a1f3198195454ff364b8f1f4b00b9d04a32355df Mon Sep 17 00:00:00 2001 From: Arnav Balyan <60175178+ArnavBalyan@users.noreply.github.com> Date: Fri, 9 Aug 2024 18:57:11 +0530 Subject: [PATCH 2/2] [VL] Fix high precision rounding (#6707) --- cpp/velox/operators/functions/Arithmetic.h | 11 +++++++---- .../expressions/GlutenMathExpressionsSuite.scala | 3 +++ .../expressions/GlutenMathExpressionsSuite.scala | 4 ++++ .../expressions/GlutenMathExpressionsSuite.scala | 3 +++ 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/cpp/velox/operators/functions/Arithmetic.h b/cpp/velox/operators/functions/Arithmetic.h index 0474e1554981..7b4c9ae9db7c 100644 --- a/cpp/velox/operators/functions/Arithmetic.h +++ b/cpp/velox/operators/functions/Arithmetic.h @@ -17,6 +17,7 @@ #include #include #include +#include #include namespace gluten { @@ -38,14 +39,16 @@ struct RoundFunction { return number; } - double factor = std::pow(10, decimals); + // Using long double for high precision during intermediate calculations. + // TODO: Make this more efficient with Boost to support high arbitrary precision at runtime. + long double factor = std::pow(10.0L, static_cast(decimals)); static const TNum kInf = std::numeric_limits::infinity(); + if (number < 0) { - return (std::round(std::nextafter(number, -kInf) * factor * -1) / factor) * -1; + return static_cast((std::round(std::nextafter(number, -kInf) * factor * -1) / factor) * -1); } - return std::round(std::nextafter(number, kInf) * factor) / factor; + return static_cast(std::round(std::nextafter(number, kInf) * factor) / factor); } - template FOLLY_ALWAYS_INLINE void call(TInput& result, const TInput& a, const int32_t b = 0) { result = round(a, b); diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala index 54583547d057..765a64f91baf 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala @@ -121,6 +121,9 @@ class GlutenMathExpressionsSuite extends MathExpressionsSuite with GlutenTestsTr checkEvaluation(Round(-3.5, 0), -4.0) checkEvaluation(Round(-0.35, 1), -0.4) checkEvaluation(Round(-35, -1), -40) + checkEvaluation(Round(1.12345678901234567, 8), 1.12345679) + checkEvaluation(Round(-0.98765432109876543, 5), -0.98765) + checkEvaluation(Round(12345.67890123456789, 6), 12345.678901) checkEvaluation(BRound(2.5, 0), 2.0) checkEvaluation(BRound(3.5, 0), 4.0) checkEvaluation(BRound(-2.5, 0), -2.0) diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala index a60f0dce644b..122f8dc066af 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala @@ -249,6 +249,10 @@ class GlutenMathExpressionsSuite extends MathExpressionsSuite with GlutenTestsTr checkEvaluation(Round(-3.5, 0), -4.0) checkEvaluation(Round(-0.35, 1), -0.4) checkEvaluation(Round(-35, -1), -40) + checkEvaluation(Round(1.12345678901234567, 8), 1.12345679) + checkEvaluation(Round(-0.98765432109876543, 5), -0.98765) + checkEvaluation(Round(12345.67890123456789, 6), 12345.678901) + checkEvaluation(Round(-35, -1), -40) checkEvaluation(Round(BigDecimal("45.00"), -1), BigDecimal(50)) checkEvaluation(BRound(2.5, 0), 2.0) checkEvaluation(BRound(3.5, 0), 4.0) diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala index e220924880c7..7308352e40c6 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala @@ -248,6 +248,9 @@ class GlutenMathExpressionsSuite extends MathExpressionsSuite with GlutenTestsTr checkEvaluation(BRound(-3.5, 0), -4.0) checkEvaluation(BRound(-0.35, 1), -0.4) checkEvaluation(BRound(-35, -1), -40) + checkEvaluation(Round(1.12345678901234567, 8), 1.12345679) + checkEvaluation(Round(-0.98765432109876543, 5), -0.98765) + checkEvaluation(Round(12345.67890123456789, 6), 12345.678901) checkEvaluation(BRound(BigDecimal("45.00"), -1), BigDecimal(40)) checkEvaluation(checkDataTypeAndCast(RoundFloor(Literal(2.5), Literal(0))), Decimal(2)) checkEvaluation(checkDataTypeAndCast(RoundFloor(Literal(3.5), Literal(0))), Decimal(3))