From b0b634ffa113f497228745e46b1b23fd92fbcfc0 Mon Sep 17 00:00:00 2001 From: wgcn <1026688210@qq.com> Date: Mon, 20 May 2024 22:06:07 +0800 Subject: [PATCH] [core][hive] decrease the size of FileIo serialization (#3348) --- docs/content/engines/hive.md | 4 +++- .../java/org/apache/paimon/utils/HadoopUtils.java | 13 ++++++++++++- .../org/apache/paimon/hive/utils/HiveUtils.java | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/content/engines/hive.md b/docs/content/engines/hive.md index 1bf3ad9009dd..5d143913a8b2 100644 --- a/docs/content/engines/hive.md +++ b/docs/content/engines/hive.md @@ -81,7 +81,9 @@ There are several ways to add this jar to Hive. NOTE: -* If you are using HDFS, make sure that the environment variable `HADOOP_HOME` or `HADOOP_CONF_DIR` is set. +* If you are using HDFS: + * Make sure that the environment variable `HADOOP_HOME` or `HADOOP_CONF_DIR` is set. + * You can set `paimon.hadoop-load-default-config` to `false` to disable loading the default values from `core-default.xml` and `hdfs-default.xml`, which may lead to a smaller split size. * With hive cbo, it may lead to some incorrect query results, such as to query `struct` type with `not null` predicate, you can disable the cbo by `set hive.cbo.enable=false;` command. 
## Hive SQL: access Paimon Tables already in Hive metastore diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/HadoopUtils.java b/paimon-common/src/main/java/org/apache/paimon/utils/HadoopUtils.java index 7d0ec0e2c49c..1227338a0029 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/HadoopUtils.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/HadoopUtils.java @@ -47,6 +47,13 @@ public class HadoopUtils { .defaultValue(HadoopConfigLoader.ALL) .withDescription("Specifies the way of loading hadoop config."); + public static final ConfigOption HADOOP_LOAD_DEFAULT_CONFIG = + key("hadoop-load-default-config") + .booleanType() + .defaultValue(true) + .withDescription( + "Specifies whether load the default configuration from core-default.xml、hdfs-default.xml, which may lead larger size for the serialization of table."); + private static final String[] CONFIG_PREFIXES = {"hadoop."}; public static final String HADOOP_HOME_ENV = "HADOOP_HOME"; public static final String HADOOP_CONF_ENV = "HADOOP_CONF_DIR"; @@ -59,7 +66,11 @@ public static Configuration getHadoopConfiguration(Options options) { // Instantiate an HdfsConfiguration to load the hdfs-site.xml and hdfs-default.xml // from the classpath - Configuration result = new HdfsConfiguration(); + Boolean loadDefaultConfig = options.get(HADOOP_LOAD_DEFAULT_CONFIG); + if (loadDefaultConfig) { + LOG.debug("Load the default value for configuration."); + } + Configuration result = new HdfsConfiguration(loadDefaultConfig); boolean foundHadoopConfiguration = false; // We need to load both core-site.xml and hdfs-site.xml to determine the default fs path and diff --git a/paimon-hive/paimon-hive-connector-common/src/main/java/org/apache/paimon/hive/utils/HiveUtils.java b/paimon-hive/paimon-hive-connector-common/src/main/java/org/apache/paimon/hive/utils/HiveUtils.java index 4ecd32b74dae..c6828897d43c 100644 --- 
a/paimon-hive/paimon-hive-connector-common/src/main/java/org/apache/paimon/hive/utils/HiveUtils.java +++ b/paimon-hive/paimon-hive-connector-common/src/main/java/org/apache/paimon/hive/utils/HiveUtils.java @@ -54,7 +54,7 @@ public class HiveUtils { public static FileStoreTable createFileStoreTable(JobConf jobConf) { Options options = extractCatalogConfig(jobConf); options.set(CoreOptions.PATH, LocationKeyExtractor.getPaimonLocation(jobConf)); - CatalogContext catalogContext = CatalogContext.create(options, jobConf); + CatalogContext catalogContext = CatalogContext.create(options); return FileStoreTableFactory.create(catalogContext); }