Skip to content

Commit

Permalink
[fix](default value) enable HLL_EMPTY_DEFAULT_VALUE (apache#34447)
Browse files Browse the repository at this point in the history
The goal of this PR is to make it possible to use `HLL_EMPTY()` to fill a missing hll column.  As shown in `test_default_hll.groovy`, if we don't specify the hll column, it will use HLL_EMPTY to fill it.

https://github.com/apache/doris/blob/f180d90bb8db1af8bee43e095de55cfe72e95ecc/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java#L362-L367

The existing analysis will add HLL_EMPTY_DEFAULT_VALUE as the default value for an hll column, but related problems would occur if we didn't pass `hll_hash(x)` ... for this column.

This PR adds a `defaultValueExprDef` for HLL_EMPTY_DEFAULT_VALUE (which is used in stream load mode) and makes a default hll column pass the analysis for `NativeInsertStmt` and `Load`.
  • Loading branch information
my-vegetable-has-exploded authored May 17, 2024
1 parent 813a697 commit 6b50368
Show file tree
Hide file tree
Showing 9 changed files with 147 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,15 @@ public DefaultValue(boolean isSet, String value, String exprName, Long precision
// default "CURRENT_TIMESTAMP", only for DATETIME type
public static String CURRENT_TIMESTAMP = "CURRENT_TIMESTAMP";
public static String NOW = "now";
public static String HLL_EMPTY = "HLL_EMPTY";
public static DefaultValue CURRENT_TIMESTAMP_DEFAULT_VALUE = new DefaultValue(true, CURRENT_TIMESTAMP, NOW);
// no default value
public static DefaultValue NOT_SET = new DefaultValue(false, null);
// default null
public static DefaultValue NULL_DEFAULT_VALUE = new DefaultValue(true, null);
public static String ZERO = new String(new byte[] {0});
// default "value", "0" means empty hll
public static DefaultValue HLL_EMPTY_DEFAULT_VALUE = new DefaultValue(true, ZERO);
public static DefaultValue HLL_EMPTY_DEFAULT_VALUE = new DefaultValue(true, ZERO, HLL_EMPTY);
// default "value", "0" means empty bitmap
public static DefaultValue BITMAP_EMPTY_DEFAULT_VALUE = new DefaultValue(true, ZERO);
// default "value", "[]" means empty array
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,8 @@ private void analyzeSubquery(Analyzer analyzer, boolean skipCheck) throws UserEx
}
// hll column must in mentionedColumns
for (Column col : targetTable.getBaseSchema()) {
if (col.getType().isObjectStored() && !mentionedColumns.contains(col.getName())) {
if (col.getType().isObjectStored() && !col.hasDefaultValue()
&& !mentionedColumns.contains(col.getName())) {
throw new AnalysisException(
"object-stored column " + col.getName() + " must in insert into columns");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ protected void initSchemaParams() throws UserException {
slotInfo.setIsFileSlot(!partitionKeys.contains(slot.getColumn().getName()));
params.addToRequiredSlots(slotInfo);
}
setDefaultValueExprs(getTargetTable(), destSlotDescByName, params, false);
setDefaultValueExprs(getTargetTable(), destSlotDescByName, null, params, false);
setColumnPositionMapping();
// For query, set src tuple id to -1.
params.setSrcTupleId(-1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ public int compare(TFileRangeDesc o1, TFileRangeDesc o2) {

protected void setDefaultValueExprs(TableIf tbl,
Map<String, SlotDescriptor> slotDescByName,
Map<String, Expr> exprByName,
TFileScanRangeParams params,
boolean useVarcharAsNull) throws UserException {
Preconditions.checkNotNull(tbl);
Expand Down Expand Up @@ -208,6 +209,13 @@ protected void setDefaultValueExprs(TableIf tbl,
expr = null;
}
}
// If there is already an expr for this column, just skip it.
// eg:
// (a, b, c, c=hll_hash(c)) in stream load
// c will be filled with hll_hash(column c), so there is no need to specify a default expr for it.
if (exprByName != null && exprByName.containsKey(column.getName())) {
continue;
}
SlotDescriptor slotDesc = slotDescByName.get(column.getName());
// if slot desc is null, which mean it is an unrelated slot, just skip.
// eg:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@ public class DefaultValue {
public static String CURRENT_DATE = "CURRENT_DATE";
public static String CURRENT_TIMESTAMP = "CURRENT_TIMESTAMP";
public static String NOW = "now";
public static String HLL_EMPTY = "HLL_EMPTY";
public static DefaultValue CURRENT_DATE_DEFAULT_VALUE = new DefaultValue(CURRENT_DATE, CURRENT_DATE.toLowerCase());
public static DefaultValue CURRENT_TIMESTAMP_DEFAULT_VALUE = new DefaultValue(CURRENT_TIMESTAMP, NOW);
// default null
public static DefaultValue NULL_DEFAULT_VALUE = new DefaultValue(null);
public static String ZERO = new String(new byte[] {0});
public static String ZERO_NUMBER = "0";
// default "value", "0" means empty hll
public static DefaultValue HLL_EMPTY_DEFAULT_VALUE = new DefaultValue(ZERO);
public static DefaultValue HLL_EMPTY_DEFAULT_VALUE = new DefaultValue(ZERO, HLL_EMPTY);
// default "value", "0" means empty bitmap
public static DefaultValue BITMAP_EMPTY_DEFAULT_VALUE = new DefaultValue(ZERO);
// default "value", "[]" means empty array
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ private void initParamCreateContexts(Analyzer analyzer) throws UserException {
// FIXME(cmy): we should support set different expr for different file group.
initAndSetPrecedingFilter(context.fileGroup.getPrecedingFilterExpr(), context.srcTupleDescriptor, analyzer);
initAndSetWhereExpr(context.fileGroup.getWhereExpr(), context.destTupleDescriptor, analyzer);
setDefaultValueExprs(scanProvider.getTargetTable(), context.srcSlotDescByName, context.params, true);
setDefaultValueExprs(scanProvider.getTargetTable(), context.srcSlotDescByName,
context.exprMap, context.params, true);
this.contexts.add(context);
}
}
Expand Down
29 changes: 29 additions & 0 deletions regression-test/data/correctness_p0/test_default_hll.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !insert_into1 --
0
0
0
0

-- !stream_load_csv1 --
0
0
0
0
1
1

-- !select_1 --
0
0
0
0

-- !stream_load_csv1 --
0
0
0
0
0
0

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
5,5
6,6
99 changes: 99 additions & 0 deletions regression-test/suites/correctness_p0/test_default_hll.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Regression test for default HLL column values: verifies that an hll column
// left unspecified is filled with an empty HLL (HLL_EMPTY default), for both
// INSERT INTO and stream load, including the partial-update path.
suite("test_default_hll") {
def tableName = "test_default_hll"

// Recreate the table from scratch: a UNIQUE KEY table with a NOT NULL hll column.
sql """ DROP TABLE IF EXISTS ${tableName} """
sql """
CREATE TABLE IF NOT EXISTS ${tableName}
(
k TINYINT,
v1 DECIMAL(10, 2) DEFAULT "0",
h1 hll NOT NULL COMMENT "hll column"
)
UNIQUE KEY(K)
DISTRIBUTED BY HASH(k)
PROPERTIES("replication_num" = "1");

0 comments on commit 6b50368

Please sign in to comment.