From e10128b2b163de874faf5793dee3b213cff5d83d Mon Sep 17 00:00:00 2001 From: Tiewei Fang Date: Wed, 18 Dec 2024 09:55:42 +0800 Subject: [PATCH] [fix](tvf) Tvf supports to parse the enclose character in csv files (#45407) ### What problem does this PR solve? Problem Summary: Tvf supports to parse the enclose character in csv files --- be/src/vec/exec/format/csv/csv_reader.cpp | 6 +- .../common/util/FileFormatConstants.java | 1 + .../ExternalFileTableValuedFunction.java | 15 ++++ .../external_table_p0/tvf/enclose_csv.csv | 6 ++ .../tvf/test_local_tvf_enclose.out | 15 ++++ .../tvf/test_local_tvf_enclose.groovy | 72 +++++++++++++++++++ 6 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 regression-test/data/external_table_p0/tvf/enclose_csv.csv create mode 100644 regression-test/data/external_table_p0/tvf/test_local_tvf_enclose.out create mode 100644 regression-test/suites/external_table_p0/tvf/test_local_tvf_enclose.groovy diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index 41ad9ad6019046..1fc3bbad294cd2 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -925,9 +925,13 @@ Status CsvReader::_prepare_parse(size_t* read_line, bool* is_parse_name) { _trim_tailing_spaces, _trim_double_quotes, _value_separator, _value_separator_length); } else { + // If we pass `_file_slot_descs.size() - 1` to EncloseCsvTextFieldSplitter, it will cause BE core dump + // because currently _file_slot_descs is an empty vector. + // The _file_slot_descs.size() is only used to reserve space, + // so it's ok to pass 0 to EncloseCsvLineReaderContext text_line_reader_ctx = std::make_shared( _line_delimiter, _line_delimiter_length, _value_separator, _value_separator_length, - _file_slot_descs.size() - 1, _enclose, _escape, _keep_cr); + 0, _enclose, _escape, _keep_cr); _fields_splitter = std::make_unique( _trim_tailing_spaces, false, std::static_pointer_cast(text_line_reader_ctx), diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java index e86c06fcb6177d..b4f313c7822791 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java @@ -49,6 +49,7 @@ public class FileFormatConstants { public static final String PROP_CSV_SCHEMA = "csv_schema"; public static final String PROP_COMPRESS_TYPE = "compress_type"; public static final String PROP_PATH_PARTITION_KEYS = "path_partition_keys"; + public static final String PROP_ENCLOSE = "enclose"; // decimal(p,s) public static final Pattern DECIMAL_TYPE_PATTERN = Pattern.compile("decimal\\((\\d+),(\\d+)\\)"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java index 967838d34f0576..1f65921832bb7e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java @@ -119,6 +119,7 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio private TTextSerdeType textSerdeType = TTextSerdeType.JSON_TEXT_SERDE; private String columnSeparator = FileFormatConstants.DEFAULT_COLUMN_SEPARATOR; private String lineDelimiter = FileFormatConstants.DEFAULT_LINE_DELIMITER; + private byte enclose = 0; private String jsonRoot = ""; private String jsonPaths = ""; private boolean stripOuterArray; @@ -230,6 +231,17 @@ protected Map parseCommonProperties(Map properti } lineDelimiter = Separator.convertSeparator(lineDelimiter); + String enclosedString = getOrDefaultAndRemove(copiedProps, FileFormatConstants.PROP_ENCLOSE, ""); + if (!Strings.isNullOrEmpty(enclosedString)) { + if (enclosedString.length() > 1) { + throw new AnalysisException("enclose should not be longer than one byte."); + } + enclose = (byte) enclosedString.charAt(0); + if (enclose == 0) { + throw new AnalysisException("enclose should not be byte [0]."); + } + } + jsonRoot = getOrDefaultAndRemove(copiedProps, FileFormatConstants.PROP_JSON_ROOT, ""); jsonPaths = getOrDefaultAndRemove(copiedProps, FileFormatConstants.PROP_JSON_PATHS, ""); readJsonByLine = Boolean.valueOf( @@ -284,6 +296,9 @@ public TFileAttributes getFileAttributes() { TFileTextScanRangeParams fileTextScanRangeParams = new TFileTextScanRangeParams(); fileTextScanRangeParams.setColumnSeparator(this.columnSeparator); fileTextScanRangeParams.setLineDelimiter(this.lineDelimiter); + if (enclose != 0) { + fileTextScanRangeParams.setEnclose(enclose); + } fileAttributes.setTextParams(fileTextScanRangeParams); if (this.fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN) { fileAttributes.setHeaderType(this.headerType); diff --git a/regression-test/data/external_table_p0/tvf/enclose_csv.csv b/regression-test/data/external_table_p0/tvf/enclose_csv.csv new file mode 100644 index 00000000000000..ca787200ff18ca --- /dev/null +++ b/regression-test/data/external_table_p0/tvf/enclose_csv.csv @@ -0,0 +1,6 @@ +id, field1, field2 +"1", "hello", "same, field" +"2", "doris", "same, field2" +"3", "nereids", "same, field3" +"4", "pipeline", "same, field4" +"5", "storage", "same, field5" \ No newline at end of file diff --git a/regression-test/data/external_table_p0/tvf/test_local_tvf_enclose.out b/regression-test/data/external_table_p0/tvf/test_local_tvf_enclose.out new file mode 100644 index 00000000000000..6e5d10e4858a9c --- /dev/null +++ b/regression-test/data/external_table_p0/tvf/test_local_tvf_enclose.out @@ -0,0 +1,15 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !enclose_1 -- +"1" "hello" "same, field" +"2" "doris" "same, field2" +"3" "nereids" "same, field3" +"4" "pipeline" "same, field4" +"5" "storage" "same, field5" + +-- !enclose_2 -- +1 hello same, field +2 doris same, field2 +3 nereids same, field3 +4 pipeline same, field4 +5 storage same, field5 + diff --git a/regression-test/suites/external_table_p0/tvf/test_local_tvf_enclose.groovy b/regression-test/suites/external_table_p0/tvf/test_local_tvf_enclose.groovy new file mode 100644 index 00000000000000..e7437cd20ec13e --- /dev/null +++ b/regression-test/suites/external_table_p0/tvf/test_local_tvf_enclose.groovy @@ -0,0 +1,72 @@ +import org.junit.Assert + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_local_tvf_enclose", "p0,tvf") { + List> backends = sql """ show backends """ + assertTrue(backends.size() > 0) + def be_id = backends[0][0] + + String filename = "enclose_csv.csv" + + def dataFilePath = context.config.dataPath + "/external_table_p0/tvf/${filename}" + + def outFilePath="/" + + for (List backend : backends) { + def be_host = backend[1] + scpFiles ("root", be_host, dataFilePath, outFilePath, false); + } + + + sql """set enable_nereids_planner=true""" + sql """set enable_fallback_to_original_planner=false""" + + qt_enclose_1 """ + select * from local( + "file_path" = "${filename}", + "backend_id" = "${be_id}", + "format" = "csv_with_names", + "column_separator" = ", ", + "enclose" = "\\\"") order by id; + """ + + qt_enclose_2 """ + select * from local( + "file_path" = "${filename}", + "backend_id" = "${be_id}", + "format" = "csv_with_names", + "column_separator" = ", ", + "enclose" = "\\\"", + "trim_double_quotes" = "true") order by id; + """ + + // test error case + test { + sql """ + select * from local( + "file_path" = "${filename}", + "backend_id" = "${be_id}", + "format" = "csv_with_names", + "column_separator" = ", ", + "enclose" = "\\\"\\\"") order by id; + """ + // check exception message contains + exception "enclose should not be longer than one byte." + } +}