From 6b50368d38416f83a3feb0258ad39768ba09db7d Mon Sep 17 00:00:00 2001 From: yi wang <48236141+my-vegetable-has-exploded@users.noreply.github.com> Date: Fri, 17 May 2024 16:08:23 +0800 Subject: [PATCH] [fix](default value) enable HLL_EMPTY_DEFAULT_VALUE (#34447) The goal of this PR is to make it possible to use `HLL_EMPTY()` to fill a missing hll column. As shown in `test_default_hll.groovy`, if the hll column is not specified, it is filled with HLL_EMPTY. https://github.com/apache/doris/blob/f180d90bb8db1af8bee43e095de55cfe72e95ecc/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java#L362-L367 The existing analysis already adds HLL_EMPTY_DEFAULT_VALUE as the default value for an hll column, but related problems occur if `hll_hash(x)` ... is not passed for this column. This PR adds `defaultValueExprDef` for HLL_EMPTY_DEFAULT_VALUE (which is used in stream load mode) and makes an hll column that relies on its default value pass the analysis for `NativeInsertStmt` and `Load`. --- .../org/apache/doris/analysis/ColumnDef.java | 3 +- .../doris/analysis/NativeInsertStmt.java | 3 +- .../doris/datasource/FileQueryScanNode.java | 2 +- .../apache/doris/datasource/FileScanNode.java | 8 ++ .../plans/commands/info/DefaultValue.java | 3 +- .../doris/planner/FileLoadScanNode.java | 3 +- .../data/correctness_p0/test_default_hll.out | 29 ++++++ .../test_default_hll_streamload.csv | 2 + .../correctness_p0/test_default_hll.groovy | 99 +++++++++++++++++++ 9 files changed, 147 insertions(+), 5 deletions(-) create mode 100644 regression-test/data/correctness_p0/test_default_hll.out create mode 100644 regression-test/data/correctness_p0/test_default_hll_streamload.csv create mode 100644 regression-test/suites/correctness_p0/test_default_hll.groovy diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java index 46a48339d54d03..efb109a9233d2f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java 
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java @@ -98,6 +98,7 @@ public DefaultValue(boolean isSet, String value, String exprName, Long precision // default "CURRENT_TIMESTAMP", only for DATETIME type public static String CURRENT_TIMESTAMP = "CURRENT_TIMESTAMP"; public static String NOW = "now"; + public static String HLL_EMPTY = "HLL_EMPTY"; public static DefaultValue CURRENT_TIMESTAMP_DEFAULT_VALUE = new DefaultValue(true, CURRENT_TIMESTAMP, NOW); // no default value public static DefaultValue NOT_SET = new DefaultValue(false, null); @@ -105,7 +106,7 @@ public DefaultValue(boolean isSet, String value, String exprName, Long precision public static DefaultValue NULL_DEFAULT_VALUE = new DefaultValue(true, null); public static String ZERO = new String(new byte[] {0}); // default "value", "0" means empty hll - public static DefaultValue HLL_EMPTY_DEFAULT_VALUE = new DefaultValue(true, ZERO); + public static DefaultValue HLL_EMPTY_DEFAULT_VALUE = new DefaultValue(true, ZERO, HLL_EMPTY); // default "value", "0" means empty bitmap public static DefaultValue BITMAP_EMPTY_DEFAULT_VALUE = new DefaultValue(true, ZERO); // default "value", "[]" means empty array diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/NativeInsertStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/NativeInsertStmt.java index ab1aaad679a8b4..c4ef8f6597f6a3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/NativeInsertStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/NativeInsertStmt.java @@ -590,7 +590,8 @@ private void analyzeSubquery(Analyzer analyzer, boolean skipCheck) throws UserEx } // hll column must in mentionedColumns for (Column col : targetTable.getBaseSchema()) { - if (col.getType().isObjectStored() && !mentionedColumns.contains(col.getName())) { + if (col.getType().isObjectStored() && !col.hasDefaultValue() + && !mentionedColumns.contains(col.getName())) { throw new AnalysisException( "object-stored column " 
+ col.getName() + " must in insert into columns"); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java index ff25464b4c9522..90c24989275787 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileQueryScanNode.java @@ -173,7 +173,7 @@ protected void initSchemaParams() throws UserException { slotInfo.setIsFileSlot(!partitionKeys.contains(slot.getColumn().getName())); params.addToRequiredSlots(slotInfo); } - setDefaultValueExprs(getTargetTable(), destSlotDescByName, params, false); + setDefaultValueExprs(getTargetTable(), destSlotDescByName, null, params, false); setColumnPositionMapping(); // For query, set src tuple id to -1. params.setSrcTupleId(-1); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java index 44f07718485c64..92cdfbcfa1f3ce 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/FileScanNode.java @@ -181,6 +181,7 @@ public int compare(TFileRangeDesc o1, TFileRangeDesc o2) { protected void setDefaultValueExprs(TableIf tbl, Map slotDescByName, + Map exprByName, TFileScanRangeParams params, boolean useVarcharAsNull) throws UserException { Preconditions.checkNotNull(tbl); @@ -208,6 +209,13 @@ protected void setDefaultValueExprs(TableIf tbl, expr = null; } } + // if there is already an expr , just skip it. + // eg: + // (a, b, c, c=hll_hash(c)) in stream load + // c will be filled with hll_hash(column c) , don't need to specify it. + if (exprByName != null && exprByName.containsKey(column.getName())) { + continue; + } SlotDescriptor slotDesc = slotDescByName.get(column.getName()); // if slot desc is null, which mean it is an unrelated slot, just skip. 
// eg: diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/DefaultValue.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/DefaultValue.java index 8e7228705ac2a5..48a22daf7b7d6d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/DefaultValue.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/DefaultValue.java @@ -27,6 +27,7 @@ public class DefaultValue { public static String CURRENT_DATE = "CURRENT_DATE"; public static String CURRENT_TIMESTAMP = "CURRENT_TIMESTAMP"; public static String NOW = "now"; + public static String HLL_EMPTY = "HLL_EMPTY"; public static DefaultValue CURRENT_DATE_DEFAULT_VALUE = new DefaultValue(CURRENT_DATE, CURRENT_DATE.toLowerCase()); public static DefaultValue CURRENT_TIMESTAMP_DEFAULT_VALUE = new DefaultValue(CURRENT_TIMESTAMP, NOW); // default null @@ -34,7 +35,7 @@ public class DefaultValue { public static String ZERO = new String(new byte[] {0}); public static String ZERO_NUMBER = "0"; // default "value", "0" means empty hll - public static DefaultValue HLL_EMPTY_DEFAULT_VALUE = new DefaultValue(ZERO); + public static DefaultValue HLL_EMPTY_DEFAULT_VALUE = new DefaultValue(ZERO, HLL_EMPTY); // default "value", "0" means empty bitmap public static DefaultValue BITMAP_EMPTY_DEFAULT_VALUE = new DefaultValue(ZERO); // default "value", "[]" means empty array diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/FileLoadScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/FileLoadScanNode.java index ca0324a51d0d93..9ecbe31673a2c2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/FileLoadScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/FileLoadScanNode.java @@ -131,7 +131,8 @@ private void initParamCreateContexts(Analyzer analyzer) throws UserException { // FIXME(cmy): we should support set different expr for different file group. 
initAndSetPrecedingFilter(context.fileGroup.getPrecedingFilterExpr(), context.srcTupleDescriptor, analyzer); initAndSetWhereExpr(context.fileGroup.getWhereExpr(), context.destTupleDescriptor, analyzer); - setDefaultValueExprs(scanProvider.getTargetTable(), context.srcSlotDescByName, context.params, true); + setDefaultValueExprs(scanProvider.getTargetTable(), context.srcSlotDescByName, + context.exprMap, context.params, true); this.contexts.add(context); } } diff --git a/regression-test/data/correctness_p0/test_default_hll.out b/regression-test/data/correctness_p0/test_default_hll.out new file mode 100644 index 00000000000000..c2e8103ded6aef --- /dev/null +++ b/regression-test/data/correctness_p0/test_default_hll.out @@ -0,0 +1,29 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !insert_into1 -- +0 +0 +0 +0 + +-- !stream_load_csv1 -- +0 +0 +0 +0 +1 +1 + +-- !select_1 -- +0 +0 +0 +0 + +-- !stream_load_csv1 -- +0 +0 +0 +0 +0 +0 + diff --git a/regression-test/data/correctness_p0/test_default_hll_streamload.csv b/regression-test/data/correctness_p0/test_default_hll_streamload.csv new file mode 100644 index 00000000000000..f4ec2d7748a0a2 --- /dev/null +++ b/regression-test/data/correctness_p0/test_default_hll_streamload.csv @@ -0,0 +1,2 @@ +5,5 +6,6 \ No newline at end of file diff --git a/regression-test/suites/correctness_p0/test_default_hll.groovy b/regression-test/suites/correctness_p0/test_default_hll.groovy new file mode 100644 index 00000000000000..b21869e30e365c --- /dev/null +++ b/regression-test/suites/correctness_p0/test_default_hll.groovy @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_default_hll") { + def tableName = "test_default_hll" + + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} + ( + k TINYINT, + v1 DECIMAL(10, 2) DEFAULT "0", + h1 hll NOT NULL COMMENT "hll column" + ) + UNIQUE KEY(K) + DISTRIBUTED BY HASH(k) + PROPERTIES("replication_num" = "1"); + """ + + // test insert into. + sql " insert into ${tableName} (k, v1, h1) values (1, 1, hll_empty()); " + sql " insert into ${tableName} (k, v1, h1) values (2, 2, hll_empty()); " + sql " insert into ${tableName} (k, v1, h1) values (3, 3, hll_empty()); " + sql " insert into ${tableName} (k, v1, h1) values (4, 4, hll_empty()); " + sql "sync" + qt_insert_into1 """ select HLL_CARDINALITY(h1) from ${tableName} order by k; """ + + // test csv stream load. 
+ streamLoad { + table "${tableName}" + + set 'column_separator', ',' + set 'columns', 'k, v1, h1=hll_hash(k)' + + file 'test_default_hll_streamload.csv' + + time 10000 // limit inflight 10s + } + + sql "sync" + + qt_stream_load_csv1 """ select HLL_CARDINALITY(h1) from ${tableName} order by k; """ + + // test partial update + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} + ( + k TINYINT, + v1 DECIMAL(10, 2) DEFAULT "0", + h1 hll NOT NULL COMMENT "hll column" + ) + UNIQUE KEY(K) + DISTRIBUTED BY HASH(k) + PROPERTIES("replication_num" = "1"); + """ + + sql "set enable_unique_key_partial_update=true;" + sql "set enable_insert_strict=false;" + + sql " insert into ${tableName} (k, v1) values (1, 1); " + sql " insert into ${tableName} (k, v1) values (2, 2); " + sql " insert into ${tableName} (k, v1) values (3, 3); " + sql " insert into ${tableName} (k, v1) values (4, 4); " + sql "sync" + + qt_select_1 "select HLL_CARDINALITY(h1) from ${tableName} order by k;" + + streamLoad { + table "${tableName}" + + set 'partial_columns', 'true' + set 'column_separator', ',' + set 'columns', 'k, v1' + + file 'test_default_hll_streamload.csv' + + time 10000 // limit inflight 10s + } + + sql "sync" + + qt_stream_load_csv1 """ select HLL_CARDINALITY(h1) from ${tableName} order by k; """ + +} \ No newline at end of file