From 662ebbcd28d98b387da583d12bd6dff4a795bfec Mon Sep 17 00:00:00 2001 From: daidai Date: Thu, 26 Dec 2024 23:48:33 +0800 Subject: [PATCH] [enhancement](utf8) import enable_text_validate_utf8 session var (#45537) Problem Summary: When reading text format files in Hive catalog and TVF, sometimes you may encounter the exception `Only support csv data in utf8 codec`. I introduced a new session variable `enable_text_validate_utf8` to control whether to check the utf8 format. --- be/src/util/utf8_check.cpp | 7 ++ be/src/util/utf8_check.h | 4 + be/src/vec/exec/format/csv/csv_reader.cpp | 6 +- .../create_preinstalled_scripts/run72.hql | 31 ++++++ .../text/utf8_check/utf8_check_fail.csv | 5 + .../datasource/hive/source/HiveScanNode.java | 4 + .../org/apache/doris/qe/SessionVariable.java | 9 ++ .../ExternalFileTableValuedFunction.java | 2 + gensrc/thrift/PlanNodes.thrift | 2 + .../hive/test_utf8_check.out | 55 ++++++++++ .../hive/test_utf8_check.groovy | 100 ++++++++++++++++++ 11 files changed, 222 insertions(+), 3 deletions(-) create mode 100644 docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv create mode 100644 regression-test/data/external_table_p0/hive/test_utf8_check.out create mode 100644 regression-test/suites/external_table_p0/hive/test_utf8_check.groovy diff --git a/be/src/util/utf8_check.cpp b/be/src/util/utf8_check.cpp index 5355b9014202bb..f90c27e5e915ac 100644 --- a/be/src/util/utf8_check.cpp +++ b/be/src/util/utf8_check.cpp @@ -327,4 +327,11 @@ bool validate_utf8(const char* src, size_t len) { return validate_utf8_naive(src, len); } #endif + +bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len) { + if (params.__isset.file_attributes && 
!params.file_attributes.enable_text_validate_utf8) { + return true; + } + return validate_utf8(src, len); +} } // namespace doris diff --git a/be/src/util/utf8_check.h b/be/src/util/utf8_check.h index 4214e186b71508..7e9b7a2a9de6af 100644 --- a/be/src/util/utf8_check.h +++ b/be/src/util/utf8_check.h @@ -17,6 +17,8 @@ #pragma once +#include + #include namespace doris { @@ -25,4 +27,6 @@ namespace doris { bool validate_utf8(const char* src, size_t len); // check utf8 use naive c++ bool validate_utf8_naive(const char* data, size_t len); + +bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len); } // namespace doris diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index 1fc3bbad294cd2..77a5b65d512ef2 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -715,7 +715,7 @@ Status CsvReader::_fill_empty_line(Block* block, std::vector& } Status CsvReader::_validate_line(const Slice& line, bool* success) { - if (!_is_proto_format && !validate_utf8(line.data, line.size)) { + if (!_is_proto_format && !validate_utf8(_params, line.data, line.size)) { if (!_is_load) { return Status::InternalError("Only support csv data in utf8 codec"); } else { @@ -951,7 +951,7 @@ Status CsvReader::_parse_col_nums(size_t* col_nums) { return Status::InternalError( "The first line is empty, can not parse column numbers"); } - if (!validate_utf8(const_cast(reinterpret_cast(ptr)), size)) { + if (!validate_utf8(_params, const_cast(reinterpret_cast(ptr)), size)) { return Status::InternalError("Only support csv data in utf8 codec"); } ptr = _remove_bom(ptr, size); @@ -968,7 +968,7 @@ Status CsvReader::_parse_col_names(std::vector* col_names) { if (size == 0) { return Status::InternalError("The first line is empty, can not parse column names"); } - if (!validate_utf8(const_cast(reinterpret_cast(ptr)), size)) { + if (!validate_utf8(_params, const_cast(reinterpret_cast(ptr)), 
size)) { return Status::InternalError("Only support csv data in utf8 codec"); } ptr = _remove_bom(ptr, size); diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql new file mode 100644 index 00000000000000..1ab754b5042705 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql @@ -0,0 +1,31 @@ +CREATE TABLE invalid_utf8_data ( + id INT, + corrupted_data STRING, + string_data1 STRING, + string_data2 STRING +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY ',' +LINES TERMINATED BY '\n' +location '/user/doris/preinstalled_data/text/utf8_check'; + + +CREATE TABLE invalid_utf8_data2 ( + id INT, + corrupted_data STRING, + string_data1 STRING, + string_data2 STRING +) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' +WITH SERDEPROPERTIES ( + "separatorChar" = ",", + "quoteChar" = "\"", + "escapeChar" = "\\" +) +location '/user/doris/preinstalled_data/text/utf8_check'; + + + +msck repair table invalid_utf8_data; +msck repair table invalid_utf8_data2; + diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv new file mode 100644 index 00000000000000..391cd4936607cb --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv @@ -0,0 +1,5 @@ +1,,AAB,helloworld +2,,AAB,helloworld +2,,AAB,helloworld +4,,AAB,helloworld +5,,AAB,helloworld diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java index 02906494b03827..b30145e6c383e8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java @@ -434,6 +434,8 @@ protected TFileAttributes getFileAttributes() throws UserException { HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0])); // 6. set null format textParams.setNullFormat(HiveProperties.getNullFormat(table)); + fileAttributes.setEnableTextValidateUtf8( + ConnectContext.get().getSessionVariable().enableTextValidateUtf8); } else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) { // set set properties of OpenCSVSerde // 1. set column separator @@ -444,6 +446,8 @@ protected TFileAttributes getFileAttributes() throws UserException { textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]); // 4. set escape char textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]); + fileAttributes.setEnableTextValidateUtf8( + ConnectContext.get().getSessionVariable().enableTextValidateUtf8); } else { throw new UserException( "unsupported hive table serde: " + serDeLib); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index e9dd9ec5822a79..f996b53825774d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -665,6 +665,8 @@ public class SessionVariable implements Serializable, Writable { */ public static final String ENABLE_AUTO_CREATE_WHEN_OVERWRITE = "enable_auto_create_when_overwrite"; + public static final String ENABLE_TEXT_VALIDATE_UTF8 = "enable_text_validate_utf8"; + /** * If set false, user couldn't submit analyze SQL and FE won't allocate any related resources. 
*/ @@ -2219,6 +2221,13 @@ public void setIgnoreShapePlanNodes(String ignoreShapePlanNodes) { }) public boolean enableAutoCreateWhenOverwrite = false; + @VariableMgr.VarAttr(name = ENABLE_TEXT_VALIDATE_UTF8, needForward = true, description = { + "对于 text 类型的文件读取,是否开启utf8编码检查。非utf8字符会显示成乱码。", + "For text type file reading, whether to enable utf8 encoding check." + + "non-utf8 characters will be displayed as garbled characters." + }) + public boolean enableTextValidateUtf8 = true; + @VariableMgr.VarAttr(name = SKIP_CHECKING_ACID_VERSION_FILE, needForward = true, description = { "跳过检查 transactional hive 版本文件 '_orc_acid_version.'", "Skip checking transactional hive version file '_orc_acid_version.'" diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java index 1f65921832bb7e..cb1a2d89c5d1a0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java @@ -304,6 +304,8 @@ public TFileAttributes getFileAttributes() { fileAttributes.setHeaderType(this.headerType); fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes); fileAttributes.setSkipLines(skipLines); + fileAttributes.setEnableTextValidateUtf8( + ConnectContext.get().getSessionVariable().enableTextValidateUtf8); } else if (this.fileFormatType == TFileFormatType.FORMAT_JSON) { fileAttributes.setJsonRoot(jsonRoot); fileAttributes.setJsonpaths(jsonPaths); diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 1b873787765b7d..7ccb12b3331bfa 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -284,6 +284,8 @@ struct TFileAttributes { 10: optional bool trim_double_quotes; // csv skip line num, only used when csv header_type is not set. 
11: optional i32 skip_lines; + //For text type file reading, whether to enable utf8 encoding check.(Catalog && TVF) + 12: optional bool enable_text_validate_utf8 = true; } struct TIcebergDeleteFileDesc { diff --git a/regression-test/data/external_table_p0/hive/test_utf8_check.out b/regression-test/data/external_table_p0/hive/test_utf8_check.out new file mode 100644 index 00000000000000..7557e789d497f1 --- /dev/null +++ b/regression-test/data/external_table_p0/hive/test_utf8_check.out @@ -0,0 +1,55 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !1 -- +1 � AAB helloworld +2 �� AAB helloworld +2 ��� AAB helloworld +4 ���� AAB helloworld +5 ����� AAB helloworld + +-- !2 -- +c1 text Yes false \N NONE +c2 text Yes false \N NONE +c3 text Yes false \N NONE +c4 text Yes false \N NONE + +-- !3 -- +1 � AAB helloworld +2 �� AAB helloworld +2 ��� AAB helloworld +4 ���� AAB helloworld +5 ����� AAB helloworld + +-- !4 -- +1 � AAB helloworld +2 �� AAB helloworld +2 ��� AAB helloworld +4 ���� AAB helloworld +5 ����� AAB helloworld + +-- !1 -- +1 � AAB helloworld +2 �� AAB helloworld +2 ��� AAB helloworld +4 ���� AAB helloworld +5 ����� AAB helloworld + +-- !2 -- +c1 text Yes false \N NONE +c2 text Yes false \N NONE +c3 text Yes false \N NONE +c4 text Yes false \N NONE + +-- !3 -- +1 � AAB helloworld +2 �� AAB helloworld +2 ��� AAB helloworld +4 ���� AAB helloworld +5 ����� AAB helloworld + +-- !4 -- +1 � AAB helloworld +2 �� AAB helloworld +2 ��� AAB helloworld +4 ���� AAB helloworld +5 ����� AAB helloworld + diff --git a/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy b/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy new file mode 100644 index 00000000000000..aa26fdede734f5 --- /dev/null +++ b/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_utf8_check","p0,external,tvf,hive,external_docker,external_docker_hive") { + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (enabled == null || !enabled.equalsIgnoreCase("true")) { + logger.info("diable Hive test.") + return; + } + + for (String hivePrefix : ["hive2","hive3"]) { + + String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort") + String catalog_name = "${hivePrefix}_test_utf8_check" + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + def hdfsUserName = "doris" + String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort") + def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}" + + sql """drop catalog if exists ${catalog_name}""" + sql """create catalog if not exists ${catalog_name} properties ( + "type"="hms", + 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}' + );""" + sql """use `${catalog_name}`.`default`""" + + + sql """ set enable_text_validate_utf8 = true; """ + + test { + sql """ select * from invalid_utf8_data """ + exception """Only support csv data in utf8 codec""" + } + + + test { + sql """ select * from invalid_utf8_data2; """ + exception """Only support csv data in utf8 codec""" + } + + + def uri = "${defaultFS}" + 
"/user/doris/preinstalled_data/text/utf8_check/utf8_check_fail.csv" + + + test { + sql """ desc function HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "csv", + "column_separator"=",")""" + exception """Only support csv data in utf8 codec""" + } + + test { + sql """select * from HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "csv", + "column_separator"=",")""" + exception """Only support csv data in utf8 codec""" + } + + + sql """ set enable_text_validate_utf8 = false; """ + + qt_1 """select * from invalid_utf8_data order by id """ + + qt_2 """ desc function HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "csv", + "column_separator"=",")""" + + + qt_3 """select * from HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "csv", + "column_separator"=",") order by c1""" + qt_4 """select * from invalid_utf8_data2 order by id """ + + + } + +} \ No newline at end of file