Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[enhancement](utf8) import enable_text_validate_utf8 session var (#45537) #46070

Merged
merged 2 commits into from
Dec 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions be/src/util/utf8_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -327,4 +327,11 @@ bool validate_utf8(const char* src, size_t len) {
return validate_utf8_naive(src, len);
}
#endif

// Scan-aware UTF-8 validation.
//
// Honors the per-scan session flag carried in `params`: when
// `file_attributes` is set and `enable_text_validate_utf8` is explicitly
// false, validation is skipped and every byte sequence is accepted
// (non-UTF-8 characters will simply render as garbage downstream).
// Otherwise delegates to the plain validate_utf8(src, len) overload.
bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len) {
    const bool check_enabled = !params.__isset.file_attributes ||
                               params.file_attributes.enable_text_validate_utf8;
    return check_enabled ? validate_utf8(src, len) : true;
}
} // namespace doris
4 changes: 4 additions & 0 deletions be/src/util/utf8_check.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#pragma once

#include <gen_cpp/PlanNodes_types.h>

#include <cstddef>

namespace doris {
Expand All @@ -25,4 +27,6 @@ namespace doris {
bool validate_utf8(const char* src, size_t len);
// Check UTF-8 validity using the naive (portable, non-SIMD) C++ implementation.
bool validate_utf8_naive(const char* data, size_t len);

bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len);
} // namespace doris
6 changes: 3 additions & 3 deletions be/src/vec/exec/format/csv/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ Status CsvReader::_fill_empty_line(Block* block, std::vector<MutableColumnPtr>&
}

Status CsvReader::_validate_line(const Slice& line, bool* success) {
if (!_is_proto_format && !validate_utf8(line.data, line.size)) {
if (!_is_proto_format && !validate_utf8(_params, line.data, line.size)) {
if (!_is_load) {
return Status::InternalError<false>("Only support csv data in utf8 codec");
} else {
Expand Down Expand Up @@ -951,7 +951,7 @@ Status CsvReader::_parse_col_nums(size_t* col_nums) {
return Status::InternalError<false>(
"The first line is empty, can not parse column numbers");
}
if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
return Status::InternalError<false>("Only support csv data in utf8 codec");
}
ptr = _remove_bom(ptr, size);
Expand All @@ -968,7 +968,7 @@ Status CsvReader::_parse_col_names(std::vector<std::string>* col_names) {
if (size == 0) {
return Status::InternalError<false>("The first line is empty, can not parse column names");
}
if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
return Status::InternalError<false>("Only support csv data in utf8 codec");
}
ptr = _remove_bom(ptr, size);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
-- Fixture tables for the UTF-8 validation regression test.
-- Both tables point at the same preinstalled HDFS data containing
-- deliberately invalid UTF-8 byte sequences in `corrupted_data`.

-- Plain delimited-text table (LazySimpleSerDe path).
CREATE TABLE invalid_utf8_data (
id INT,
corrupted_data STRING,
string_data1 STRING,
string_data2 STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
location '/user/doris/preinstalled_data/text/utf8_check';


-- Same data exposed through OpenCSVSerde, to cover the CSV-serde read path.
CREATE TABLE invalid_utf8_data2 (
id INT,
corrupted_data STRING,
string_data1 STRING,
string_data2 STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = ",",
"quoteChar" = "\"",
"escapeChar" = "\\"
)
location '/user/doris/preinstalled_data/text/utf8_check';



-- Discover partitions/files for the externally-located tables.
msck repair table invalid_utf8_data;
msck repair table invalid_utf8_data2;


Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1,€,AAB,helloworld
2,À€,AAB,helloworld
2,à€€,AAB,helloworld
4,ð€€€,AAB,helloworld
5,ø€€€€,AAB,helloworld
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,8 @@ protected TFileAttributes getFileAttributes() throws UserException {
TFileAttributes fileAttributes = new TFileAttributes();
fileAttributes.setTextParams(textParams);
fileAttributes.setHeaderType("");
fileAttributes.setEnableTextValidateUtf8(
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
if (textParams.isSet(TFileTextScanRangeParams._Fields.ENCLOSE)) {
fileAttributes.setTrimDoubleQuotes(true);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,8 @@ public class SessionVariable implements Serializable, Writable {
*/
public static final String ENABLE_AUTO_CREATE_WHEN_OVERWRITE = "enable_auto_create_when_overwrite";

public static final String ENABLE_TEXT_VALIDATE_UTF8 = "enable_text_validate_utf8";

/**
* If set false, user couldn't submit analyze SQL and FE won't allocate any related resources.
*/
Expand Down Expand Up @@ -2219,6 +2221,13 @@ public void setIgnoreShapePlanNodes(String ignoreShapePlanNodes) {
})
public boolean enableAutoCreateWhenOverwrite = false;

@VariableMgr.VarAttr(name = ENABLE_TEXT_VALIDATE_UTF8, needForward = true, description = {
"对于 text 类型的文件读取,是否开启utf8编码检查。非utf8字符会显示成乱码。",
"For text type file reading, whether to enable utf8 encoding check."
+ "non-utf8 characters will be displayed as garbled characters."
})
public boolean enableTextValidateUtf8 = true;

@VariableMgr.VarAttr(name = SKIP_CHECKING_ACID_VERSION_FILE, needForward = true, description = {
"跳过检查 transactional hive 版本文件 '_orc_acid_version.'",
"Skip checking transactional hive version file '_orc_acid_version.'"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@ public TFileAttributes getFileAttributes() {
fileAttributes.setHeaderType(this.headerType);
fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes);
fileAttributes.setSkipLines(skipLines);
fileAttributes.setEnableTextValidateUtf8(
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
} else if (this.fileFormatType == TFileFormatType.FORMAT_JSON) {
fileAttributes.setJsonRoot(jsonRoot);
fileAttributes.setJsonpaths(jsonPaths);
Expand Down
2 changes: 2 additions & 0 deletions gensrc/thrift/PlanNodes.thrift
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,8 @@ struct TFileAttributes {
10: optional bool trim_double_quotes;
// csv skip line num, only used when csv header_type is not set.
11: optional i32 skip_lines;
// For text-type file reading, whether to enable UTF-8 encoding validation. (Catalog && TVF)
12: optional bool enable_text_validate_utf8 = true;
}

struct TIcebergDeleteFileDesc {
Expand Down
55 changes: 55 additions & 0 deletions regression-test/data/external_table_p0/hive/test_utf8_check.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !1 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !2 --
c1 text Yes false \N NONE
c2 text Yes false \N NONE
c3 text Yes false \N NONE
c4 text Yes false \N NONE

-- !3 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !4 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !1 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !2 --
c1 text Yes false \N NONE
c2 text Yes false \N NONE
c3 text Yes false \N NONE
c4 text Yes false \N NONE

-- !3 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !4 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

100 changes: 100 additions & 0 deletions regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.


// Regression test for the `enable_text_validate_utf8` session variable:
// when true (the default) reading non-UTF-8 text data must fail with
// "Only support csv data in utf8 codec"; when false the invalid bytes are
// passed through (and show up as replacement characters in the .out file).
suite("test_utf8_check","p0,external,tvf,hive,external_docker,external_docker_hive") {
    String enabled = context.config.otherConfigs.get("enableHiveTest")
    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
        // Fixed typo in the original message ("diable").
        logger.info("disable Hive test.")
        return;
    }

    // Run the same checks against both Hive 2 and Hive 3 docker environments.
    for (String hivePrefix : ["hive2","hive3"]) {

        String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
        String catalog_name = "${hivePrefix}_test_utf8_check"
        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
        def hdfsUserName = "doris"
        String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort")
        def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"

        sql """drop catalog if exists ${catalog_name}"""
        sql """create catalog if not exists ${catalog_name} properties (
            "type"="hms",
            'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
        );"""
        sql """use `${catalog_name}`.`default`"""

        // With validation enabled, scans over both the plain text table and
        // the OpenCSVSerde table must be rejected.
        sql """ set enable_text_validate_utf8 = true; """

        test {
            sql """ select * from invalid_utf8_data """
            exception """Only support csv data in utf8 codec"""
        }

        test {
            sql """ select * from invalid_utf8_data2; """
            exception """Only support csv data in utf8 codec"""
        }

        def uri = "${defaultFS}" + "/user/doris/preinstalled_data/text/utf8_check/utf8_check_fail.csv"

        // TVF path: schema inference (desc function) and the actual scan must
        // both fail while validation is on.
        test {
            sql """ desc function HDFS(
            "uri" = "${uri}",
            "hadoop.username" = "${hdfsUserName}",
            "format" = "csv",
            "column_separator"=",")"""
            exception """Only support csv data in utf8 codec"""
        }

        test {
            sql """select * from HDFS(
            "uri" = "${uri}",
            "hadoop.username" = "${hdfsUserName}",
            "format" = "csv",
            "column_separator"=",")"""
            exception """Only support csv data in utf8 codec"""
        }

        // With validation disabled, the same queries succeed; qt_* results are
        // compared against regression-test/data/.../test_utf8_check.out.
        sql """ set enable_text_validate_utf8 = false; """

        qt_1 """select * from invalid_utf8_data order by id """

        qt_2 """ desc function HDFS(
            "uri" = "${uri}",
            "hadoop.username" = "${hdfsUserName}",
            "format" = "csv",
            "column_separator"=",")"""

        qt_3 """select * from HDFS(
            "uri" = "${uri}",
            "hadoop.username" = "${hdfsUserName}",
            "format" = "csv",
            "column_separator"=",") order by c1"""
        qt_4 """select * from invalid_utf8_data2 order by id """
    }
}
Loading