From 268c69971d4ed7ca8d999edbcc12fb4b6f980608 Mon Sep 17 00:00:00 2001 From: AKIRA <33112463+Kikyou1997@users.noreply.github.com> Date: Wed, 1 Nov 2023 14:31:35 +0800 Subject: [PATCH] [fix](stats) Store max/min by base64 --- .../doris/statistics/BaseAnalysisTask.java | 15 +++++----- .../apache/doris/statistics/ColStatsData.java | 8 +++-- .../doris/statistics/ColumnStatistic.java | 9 ++++++ .../doris/statistics/HMSAnalysisTask.java | 8 ++--- .../doris/statistics/JdbcAnalysisTask.java | 4 +-- .../doris/statistics/OlapAnalysisTask.java | 4 ++- .../doris/statistics/StatsMockUtil.java | 3 +- .../jdbc/test_mysql_jdbc_statistics.groovy | 4 +-- .../suites/statistics/analyze_stats.groovy | 29 ++++++++++++++++++- 9 files changed, 64 insertions(+), 20 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index ad74266a7c365a..a73d2a2c06b045 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -78,7 +78,8 @@ public abstract class BaseAnalysisTask { protected static final String INSERT_COL_STATISTICS = "INSERT INTO " + "${internalDB}.${columnStatTbl}" + " SELECT id, catalog_id, db_id, tbl_id, idx_id, col_id, part_id, row_count, " - + " ndv, null_count, CAST(min AS string), CAST(max AS string), data_size, update_time\n" + + " ndv, null_count," + + " to_base64(CAST(min AS string)), to_base64(CAST(max AS string)), data_size, update_time\n" + " FROM \n" + " (SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + " ${catalogId} AS catalog_id, " @@ -89,8 +90,8 @@ public abstract class BaseAnalysisTask { + " NULL AS part_id, " + " SUM(count) AS row_count, \n" + " SUM(null_count) AS null_count, " - + " MIN(CAST(min AS ${type})) AS min, " - + " MAX(CAST(max AS ${type})) AS max, " + + " MIN(CAST(from_base64(min) AS ${type})) AS min, " + 
+ " MAX(CAST(from_base64(max) AS ${type})) AS max, " + " SUM(data_size_in_bytes) AS data_size, " + " NOW() AS update_time \n" + " FROM ${internalDB}.${columnStatTbl}" @@ -114,8 +115,8 @@ public abstract class BaseAnalysisTask { + "${row_count} AS row_count, " + "${ndv} AS ndv, " + "${null_count} AS null_count, " - + "'${min}' AS min, " - + "'${max}' AS max, " + + "to_base64('${min}') AS min, " + + "to_base64('${max}') AS max, " + "${data_size} AS data_size, " + "NOW() "; @@ -241,7 +242,7 @@ protected String getDataSizeFunction(Column column) { // Min value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan. protected String getMinFunction() { if (tableSample == null) { - return "MIN(`${colName}`) "; + return "CAST(MIN(`${colName}`) AS ${type})"; } else { return "NULL "; } @@ -250,7 +251,7 @@ protected String getMinFunction() { // Max value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan. protected String getMaxFunction() { if (tableSample == null) { - return "MAX(`${colName}`) "; + return "CAST(MAX(`${colName}`) AS ${type})"; } else { return "NULL "; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java index 3cbd1b5a61129b..6c94326a9424ec 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java @@ -19,6 +19,8 @@ import org.apache.doris.statistics.util.StatisticsUtil; +import java.nio.charset.StandardCharsets; +import java.util.Base64; import java.util.StringJoiner; /** @@ -73,8 +75,10 @@ public String toSQL(boolean roundByParentheses) { sj.add(String.valueOf(count)); sj.add(String.valueOf(ndv)); sj.add(String.valueOf(nullCount)); - sj.add(StatisticsUtil.quote(StatisticsUtil.escapeSQL(minLit))); - sj.add(StatisticsUtil.quote(StatisticsUtil.escapeSQL(maxLit))); + sj.add(minLit == null ? 
"NULL" : + "'" + Base64.getEncoder().encodeToString(minLit.getBytes(StandardCharsets.UTF_8)) + "'"); + sj.add(maxLit == null ? "NULL" : + "'" + Base64.getEncoder().encodeToString(maxLit.getBytes(StandardCharsets.UTF_8)) + "'"); sj.add(String.valueOf(dataSizeInBytes)); sj.add(StatisticsUtil.quote(updateTime)); return sj.toString(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java index c6b019f669b65d..82e0efdac1784d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java @@ -30,6 +30,8 @@ import org.apache.logging.log4j.Logger; import org.json.JSONObject; +import java.nio.charset.StandardCharsets; +import java.util.Base64; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -172,6 +174,9 @@ public static ColumnStatistic fromResultRow(ResultRow row) { String min = row.get(10); String max = row.get(11); if (min != null && !min.equalsIgnoreCase("NULL")) { + min = new String(Base64.getDecoder().decode(min), + StandardCharsets.UTF_8); + try { columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min)); columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min)); @@ -183,6 +188,10 @@ public static ColumnStatistic fromResultRow(ResultRow row) { columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY); } if (max != null && !max.equalsIgnoreCase("NULL")) { + + max = new String(Base64.getDecoder().decode(max), + StandardCharsets.UTF_8); + try { columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max)); columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max)); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 
188665645c386f..4583237f8c61f5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -64,8 +64,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, " + NDV_SAMPLE_TEMPLATE + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, " - + "${minFunction} AS min, " - + "${maxFunction} AS max, " + + "to_base64(${minFunction}) AS min, " + + "to_base64(${maxFunction}) AS max, " + "${dataSizeFunction} * ${scaleFactor} AS data_size, " + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; @@ -81,8 +81,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "COUNT(1) AS row_count, " + "NDV(`${colName}`) AS ndv, " + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " - + "MIN(`${colName}`) AS min, " - + "MAX(`${colName}`) AS max, " + + "to_base64(MIN(`${colName}`)) AS min, " + + "to_base64(MAX(`${colName}`)) AS max, " + "${dataSizeFunction} AS data_size, " + "NOW() FROM `${catalogName}`.`${dbName}`.`${tblName}` where "; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java index 0c148b5ad8d7f3..5ae66d292dc43d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java @@ -49,8 +49,8 @@ public class JdbcAnalysisTask extends BaseAnalysisTask { + "COUNT(1) AS row_count, " + "NDV(`${colName}`) AS ndv, " + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " - + "MIN(`${colName}`) AS min, " - + "MAX(`${colName}`) AS max, " + + "to_base64(MIN(`${colName}`)) AS min, " + + "to_base64(MAX(`${colName}`)) AS max, " + "${dataSizeFunction} AS data_size, " + "NOW() " + "FROM 
`${catalogName}`.`${dbName}`.`${tblName}`"; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 2df7b9c358d568..185a582cde436e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -37,6 +37,7 @@ import java.security.SecureRandom; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.List; @@ -121,8 +122,9 @@ protected void doSample() throws Exception { List tabletIds = pair.first; double scaleFactor = (double) tbl.getRowCount() / (double) pair.second; // might happen if row count in fe metadata hasn't been updated yet - if (Double.isInfinite(scaleFactor)) { + if (Double.isInfinite(scaleFactor) || Double.isNaN(scaleFactor)) { scaleFactor = 1; + tabletIds = Collections.emptyList(); } String tabletStr = tabletIds.stream() .map(Object::toString) diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java index 21035051ff8606..84e1112d216cdb 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java @@ -39,7 +39,8 @@ public static ResultRow mockResultRow(boolean col) { add("8"); add("0"); add("10"); - add("11"); + // 11 + add("MTE="); add("12"); add(String.valueOf(System.currentTimeMillis())); }}; diff --git a/regression-test/suites/external_table_p0/jdbc/test_mysql_jdbc_statistics.groovy b/regression-test/suites/external_table_p0/jdbc/test_mysql_jdbc_statistics.groovy index e58e17cfcd6d41..f73ab3aaad99f1 100644 --- a/regression-test/suites/external_table_p0/jdbc/test_mysql_jdbc_statistics.groovy +++ 
b/regression-test/suites/external_table_p0/jdbc/test_mysql_jdbc_statistics.groovy @@ -44,8 +44,8 @@ suite("test_mysql_jdbc_statistics", "p0,external,mysql,external_docker,external_ assertTrue(result[0][3] == "0.0") assertTrue(result[0][4] == "15.0") assertTrue(result[0][5] == "3.0") - assertTrue(result[0][6] == "'abc'") - assertTrue(result[0][7] == "'abg'") + assertEquals(result[0][6], "'abc'") + assertEquals(result[0][7], "'abg'") result = sql """show column stats ex_tb0 (id)""" assertTrue(result.size() == 1) diff --git a/regression-test/suites/statistics/analyze_stats.groovy b/regression-test/suites/statistics/analyze_stats.groovy index f6967551ea9852..4e4c4a08425944 100644 --- a/regression-test/suites/statistics/analyze_stats.groovy +++ b/regression-test/suites/statistics/analyze_stats.groovy @@ -881,7 +881,7 @@ PARTITION `p599` VALUES IN (599) sql """ANALYZE TABLE test_600_partition_table_analyze WITH SYNC""" - // column_name | count | ndv | num_null | data_size | avg_size_byte | min | max | updated_time + // 0:column_name | 1:count | 2:ndv | 3:num_null | 4:data_size | 5:avg_size_byte | 6:min | 7:max | 8:updated_time id_col_stats = sql """ SHOW COLUMN CACHED STATS test_600_partition_table_analyze(id); """ @@ -1124,4 +1124,31 @@ PARTITION `p599` VALUES IN (599) result = sql """SHOW COLUMN STATS test_analyze_specific_column""" assert result.size() == 1 + // test escape sql + sql """ + DROP TABLE IF EXISTS test_max_min_lit; + """ + + sql """ + CREATE TABLE test_max_min_lit ( + `col1` varchar(32) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`col1`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`col1`) BUCKETS 3 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """INSERT INTO test_max_min_lit VALUES("\\'")""" + sql """INSERT INTO test_max_min_lit VALUES('\\';')""" + sql "INSERT INTO test_max_min_lit VALUES('测试')" + + sql """ANALYZE TABLE test_max_min_lit WITH SYNC""" + def max = sql """show column cached stats test_max_min_lit""" + def 
expected_max = { r, expected_value -> + return (r[0][7]).equals(expected_value) + } + assert expected_max(max, "'测试'") }