Skip to content

Commit

Permalink
[enhancement](utf8) import enable_text_validate_utf8 session var (apache…
Browse files Browse the repository at this point in the history
…#45537)

Problem Summary:
When reading text format files in Hive catalog and TVF, sometimes you
may encounter the exception `Only support csv data in utf8 codec`.
I introduced a new session variable `enable_text_validate_utf8` to
control whether to check the utf8 format.

Introduced `enable_text_validate_utf8` session variable to control
whether to check the utf8 format.
  • Loading branch information
hubgeter committed Dec 27, 2024
1 parent 8d3e8fe commit 662ebbc
Show file tree
Hide file tree
Showing 11 changed files with 222 additions and 3 deletions.
7 changes: 7 additions & 0 deletions be/src/util/utf8_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -327,4 +327,11 @@ bool validate_utf8(const char* src, size_t len) {
return validate_utf8_naive(src, len);
}
#endif

// Overload that honors the per-scan utf8 switch: when the scan range's
// file_attributes explicitly disable enable_text_validate_utf8, validation is
// skipped and the data is accepted as-is; otherwise fall through to the
// regular byte-level utf8 check.
bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len) {
    const bool check_enabled =
            !params.__isset.file_attributes || params.file_attributes.enable_text_validate_utf8;
    return check_enabled ? validate_utf8(src, len) : true;
}
} // namespace doris
4 changes: 4 additions & 0 deletions be/src/util/utf8_check.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#pragma once

#include <gen_cpp/PlanNodes_types.h>

#include <cstddef>

namespace doris {
Expand All @@ -25,4 +27,6 @@ namespace doris {
bool validate_utf8(const char* src, size_t len);
// check utf8 use naive c++
bool validate_utf8_naive(const char* data, size_t len);

// check utf8, honoring params: skipped (always returns true) when
// params.file_attributes disables enable_text_validate_utf8
bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len);
} // namespace doris
6 changes: 3 additions & 3 deletions be/src/vec/exec/format/csv/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ Status CsvReader::_fill_empty_line(Block* block, std::vector<MutableColumnPtr>&
}

Status CsvReader::_validate_line(const Slice& line, bool* success) {
if (!_is_proto_format && !validate_utf8(line.data, line.size)) {
if (!_is_proto_format && !validate_utf8(_params, line.data, line.size)) {
if (!_is_load) {
return Status::InternalError<false>("Only support csv data in utf8 codec");
} else {
Expand Down Expand Up @@ -951,7 +951,7 @@ Status CsvReader::_parse_col_nums(size_t* col_nums) {
return Status::InternalError<false>(
"The first line is empty, can not parse column numbers");
}
if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
return Status::InternalError<false>("Only support csv data in utf8 codec");
}
ptr = _remove_bom(ptr, size);
Expand All @@ -968,7 +968,7 @@ Status CsvReader::_parse_col_names(std::vector<std::string>* col_names) {
if (size == 0) {
return Status::InternalError<false>("The first line is empty, can not parse column names");
}
if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
return Status::InternalError<false>("Only support csv data in utf8 codec");
}
ptr = _remove_bom(ptr, size);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
-- Plain-text (LazySimpleSerDe) table over files containing invalid utf8
-- bytes; used to exercise the enable_text_validate_utf8 session variable.
CREATE TABLE invalid_utf8_data (
id INT,
corrupted_data STRING,
string_data1 STRING,
string_data2 STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
location '/user/doris/preinstalled_data/text/utf8_check';


-- Same data exposed through OpenCSVSerde, so both text serde code paths
-- that carry the utf8-validation flag are covered.
CREATE TABLE invalid_utf8_data2 (
id INT,
corrupted_data STRING,
string_data1 STRING,
string_data2 STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = ",",
"quoteChar" = "\"",
"escapeChar" = "\\"
)
location '/user/doris/preinstalled_data/text/utf8_check';
-- Register the pre-installed HDFS partitions/files with the metastore.
msck repair table invalid_utf8_data;
msck repair table invalid_utf8_data2;
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1,�,AAB,helloworld
2,��,AAB,helloworld
2,���,AAB,helloworld
4,����,AAB,helloworld
5,�����,AAB,helloworld
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,8 @@ protected TFileAttributes getFileAttributes() throws UserException {
HiveProperties.getEscapeDelimiter(table).ifPresent(d -> textParams.setEscape(d.getBytes()[0]));
// 6. set null format
textParams.setNullFormat(HiveProperties.getNullFormat(table));
fileAttributes.setEnableTextValidateUtf8(
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
} else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
// set properties of OpenCSVSerde
// 1. set column separator
Expand All @@ -444,6 +446,8 @@ protected TFileAttributes getFileAttributes() throws UserException {
textParams.setEnclose(HiveProperties.getQuoteChar(table).getBytes()[0]);
// 4. set escape char
textParams.setEscape(HiveProperties.getEscapeChar(table).getBytes()[0]);
fileAttributes.setEnableTextValidateUtf8(
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
} else {
throw new UserException(
"unsupported hive table serde: " + serDeLib);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,8 @@ public class SessionVariable implements Serializable, Writable {
*/
public static final String ENABLE_AUTO_CREATE_WHEN_OVERWRITE = "enable_auto_create_when_overwrite";

public static final String ENABLE_TEXT_VALIDATE_UTF8 = "enable_text_validate_utf8";

/**
* If set false, user couldn't submit analyze SQL and FE won't allocate any related resources.
*/
Expand Down Expand Up @@ -2219,6 +2221,13 @@ public void setIgnoreShapePlanNodes(String ignoreShapePlanNodes) {
})
public boolean enableAutoCreateWhenOverwrite = false;

/**
 * Whether to validate utf8 encoding when reading text-format files.
 * Forwarded to BE through TFileAttributes.enable_text_validate_utf8; when
 * disabled, non-utf8 bytes are passed through and display as garbled text.
 */
@VariableMgr.VarAttr(name = ENABLE_TEXT_VALIDATE_UTF8, needForward = true, description = {
"对于 text 类型的文件读取,是否开启utf8编码检查。非utf8字符会显示成乱码。",
"For text type file reading, whether to enable utf8 encoding check."
+ "non-utf8 characters will be displayed as garbled characters."
})
public boolean enableTextValidateUtf8 = true;

@VariableMgr.VarAttr(name = SKIP_CHECKING_ACID_VERSION_FILE, needForward = true, description = {
"跳过检查 transactional hive 版本文件 '_orc_acid_version.'",
"Skip checking transactional hive version file '_orc_acid_version.'"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@ public TFileAttributes getFileAttributes() {
fileAttributes.setHeaderType(this.headerType);
fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes);
fileAttributes.setSkipLines(skipLines);
fileAttributes.setEnableTextValidateUtf8(
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
} else if (this.fileFormatType == TFileFormatType.FORMAT_JSON) {
fileAttributes.setJsonRoot(jsonRoot);
fileAttributes.setJsonpaths(jsonPaths);
Expand Down
2 changes: 2 additions & 0 deletions gensrc/thrift/PlanNodes.thrift
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,8 @@ struct TFileAttributes {
10: optional bool trim_double_quotes;
// csv skip line num, only used when csv header_type is not set.
11: optional i32 skip_lines;
// For text type file reading, whether to enable the utf8 encoding check. (Catalog && TVF)
12: optional bool enable_text_validate_utf8 = true;
}

struct TIcebergDeleteFileDesc {
Expand Down
55 changes: 55 additions & 0 deletions regression-test/data/external_table_p0/hive/test_utf8_check.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !1 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !2 --
c1 text Yes false \N NONE
c2 text Yes false \N NONE
c3 text Yes false \N NONE
c4 text Yes false \N NONE

-- !3 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !4 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !1 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !2 --
c1 text Yes false \N NONE
c2 text Yes false \N NONE
c3 text Yes false \N NONE
c4 text Yes false \N NONE

-- !3 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !4 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

100 changes: 100 additions & 0 deletions regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.


suite("test_utf8_check","p0,external,tvf,hive,external_docker,external_docker_hive") {
    // Verifies the enable_text_validate_utf8 session variable end to end:
    //  - true (default): reading non-utf8 text data fails with
    //    "Only support csv data in utf8 codec", for both Hive catalog tables
    //    and the HDFS table-valued function;
    //  - false: the same reads succeed and return the raw (garbled) bytes,
    //    compared against test_utf8_check.out.
    String enabled = context.config.otherConfigs.get("enableHiveTest")
    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
        // Fixed typo: was "diable Hive test."
        logger.info("disable Hive test.")
        return;
    }

    for (String hivePrefix : ["hive2","hive3"]) {

        String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
        String catalog_name = "${hivePrefix}_test_utf8_check"
        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
        def hdfsUserName = "doris"
        String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort")
        def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"

        sql """drop catalog if exists ${catalog_name}"""
        sql """create catalog if not exists ${catalog_name} properties (
            "type"="hms",
            'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
        );"""
        sql """use `${catalog_name}`.`default`"""

        // Validation on: both serde variants (text + OpenCSVSerde) must reject
        // the invalid utf8 fixture.
        sql """ set enable_text_validate_utf8 = true; """

        test {
            sql """ select * from invalid_utf8_data """
            exception """Only support csv data in utf8 codec"""
        }

        test {
            sql """ select * from invalid_utf8_data2; """
            exception """Only support csv data in utf8 codec"""
        }

        def uri = "${defaultFS}" + "/user/doris/preinstalled_data/text/utf8_check/utf8_check_fail.csv"

        // TVF path: both schema inference (desc function) and the scan itself
        // hit the utf8 check.
        test {
            sql """ desc function HDFS(
                    "uri" = "${uri}",
                    "hadoop.username" = "${hdfsUserName}",
                    "format" = "csv",
                    "column_separator"=",")"""
            exception """Only support csv data in utf8 codec"""
        }

        test {
            sql """select * from HDFS(
                    "uri" = "${uri}",
                    "hadoop.username" = "${hdfsUserName}",
                    "format" = "csv",
                    "column_separator"=",")"""
            exception """Only support csv data in utf8 codec"""
        }

        // Validation off: the same queries succeed; results checked against
        // the .out expectations.
        sql """ set enable_text_validate_utf8 = false; """

        qt_1 """select * from invalid_utf8_data order by id """

        qt_2 """ desc function HDFS(
                "uri" = "${uri}",
                "hadoop.username" = "${hdfsUserName}",
                "format" = "csv",
                "column_separator"=",")"""

        qt_3 """select * from HDFS(
                "uri" = "${uri}",
                "hadoop.username" = "${hdfsUserName}",
                "format" = "csv",
                "column_separator"=",") order by c1"""
        qt_4 """select * from invalid_utf8_data2 order by id """

    }

}

0 comments on commit 662ebbc

Please sign in to comment.