From 6f967b4db5ca69d9ca530195bfc56e461b1719c6 Mon Sep 17 00:00:00 2001 From: Jingsong Lee Date: Mon, 2 Dec 2024 15:43:53 +0800 Subject: [PATCH] [core] Make metadata.stats-dense-store default value is true (#4617) --- docs/content/flink/sql-ddl.md | 4 +- .../generated/core_configuration.html | 4 +- .../java/org/apache/paimon/CoreOptions.java | 6 +-- .../apache/paimon/stats/StatsTableTest.java | 52 ++++++++++++++++++- .../table/AppendOnlyFileStoreTableTest.java | 1 - .../table/PrimaryKeyFileStoreTableTest.java | 2 - 6 files changed, 58 insertions(+), 11 deletions(-) diff --git a/docs/content/flink/sql-ddl.md b/docs/content/flink/sql-ddl.md index 0324e6655689..8b8c069229aa 100644 --- a/docs/content/flink/sql-ddl.md +++ b/docs/content/flink/sql-ddl.md @@ -203,8 +203,8 @@ Paimon will automatically collect the statistics of the data file for speeding u The statistics collector mode can be configured by `'metadata.stats-mode'`, by default is `'truncate(16)'`. You can configure the field level by setting `'fields.{field_name}.stats-mode'`. -For the stats mode of `none`, we suggest that you configure `metadata.stats-dense-store` = `true`, which will -significantly reduce the storage size of the manifest. +For the stats mode of `none`, by default `metadata.stats-dense-store` is `true`, which will significantly reduce the +storage size of the manifest. But the Paimon sdk in reading engine requires at least version 0.9.1 or 1.0.0 or higher. ### Field Default Value diff --git a/docs/layouts/shortcodes/generated/core_configuration.html b/docs/layouts/shortcodes/generated/core_configuration.html index a38f07784fd3..fad1f4907e5a 100644 --- a/docs/layouts/shortcodes/generated/core_configuration.html +++ b/docs/layouts/shortcodes/generated/core_configuration.html @@ -485,9 +485,9 @@
metadata.stats-dense-store
- false + true Boolean - Whether to store statistic densely in metadata (manifest files), which will significantly reduce the storage size of metadata when the none statistic mode is set.
Note, when this mode is enabled, the Paimon sdk in reading engine requires at least version 0.9.1 or 1.0.0 or higher. + Whether to store statistic densely in metadata (manifest files), which will significantly reduce the storage size of metadata when the none statistic mode is set.
Note, when this mode is enabled with 'metadata.stats-mode:none', the Paimon sdk in reading engine requires at least version 0.9.1 or 1.0.0 or higher.
metadata.stats-mode
diff --git a/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java b/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java index fce09357f07a..b9b5675f1d2d 100644 --- a/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java +++ b/paimon-common/src/main/java/org/apache/paimon/CoreOptions.java @@ -1101,7 +1101,7 @@ public class CoreOptions implements Serializable { public static final ConfigOption METADATA_STATS_DENSE_STORE = key("metadata.stats-dense-store") .booleanType() - .defaultValue(false) + .defaultValue(true) .withDescription( Description.builder() .text( @@ -1110,8 +1110,8 @@ public class CoreOptions implements Serializable { + " none statistic mode is set.") .linebreak() .text( - "Note, when this mode is enabled, the Paimon sdk in reading engine requires" - + " at least version 0.9.1 or 1.0.0 or higher.") + "Note, when this mode is enabled with 'metadata.stats-mode:none', the Paimon sdk in" + + " reading engine requires at least version 0.9.1 or 1.0.0 or higher.") .build()); public static final ConfigOption COMMIT_CALLBACKS = diff --git a/paimon-core/src/test/java/org/apache/paimon/stats/StatsTableTest.java b/paimon-core/src/test/java/org/apache/paimon/stats/StatsTableTest.java index 494b2e28e459..25282d898a3d 100644 --- a/paimon-core/src/test/java/org/apache/paimon/stats/StatsTableTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/stats/StatsTableTest.java @@ -35,6 +35,7 @@ import org.junit.jupiter.api.Test; +import static org.apache.paimon.CoreOptions.METADATA_STATS_DENSE_STORE; import static org.apache.paimon.CoreOptions.METADATA_STATS_MODE; import static org.assertj.core.api.Assertions.assertThat; @@ -42,10 +43,11 @@ public class StatsTableTest extends TableTestBase { @Test - public void testPartitionStats() throws Exception { + public void testPartitionStatsNotDense() throws Exception { Identifier identifier = identifier("T"); Options options = new Options(); options.set(METADATA_STATS_MODE, "NONE"); + options.set(METADATA_STATS_DENSE_STORE, false); options.set(CoreOptions.BUCKET, 1); Schema schema = Schema.newBuilder() @@ -90,4 +92,52 @@ public void testPartitionStats() throws Exception { assertThat(recordStats.maxValues().isNullAt(1)).isTrue(); assertThat(recordStats.maxValues().isNullAt(2)).isTrue(); } + + @Test + public void testPartitionStatsDenseMode() throws Exception { + Identifier identifier = identifier("T"); + Options options = new Options(); + options.set(METADATA_STATS_MODE, "NONE"); + options.set(CoreOptions.BUCKET, 1); + Schema schema = + Schema.newBuilder() + .column("pt", DataTypes.INT()) + .column("pk", DataTypes.INT()) + .column("col1", DataTypes.INT()) + .partitionKeys("pt") + .primaryKey("pk", "pt") + .options(options.toMap()) + .build(); + catalog.createTable(identifier, schema, true); + Table table = catalog.getTable(identifier); + + write( + table, + GenericRow.of(1, 1, 1), + GenericRow.of(1, 2, 1), + GenericRow.of(1, 3, 1), + GenericRow.of(2, 1, 1)); + + FileStoreTable storeTable = (FileStoreTable) table; + FileStore store = storeTable.store(); + String manifestListFile = storeTable.snapshotManager().latestSnapshot().deltaManifestList(); + + ManifestList manifestList = store.manifestListFactory().create(); + ManifestFileMeta manifest = manifestList.read(manifestListFile).get(0); + + // should have partition stats + SimpleStats partitionStats = manifest.partitionStats(); + assertThat(partitionStats.minValues().getInt(0)).isEqualTo(1); + assertThat(partitionStats.maxValues().getInt(0)).isEqualTo(2); + + // should not have record stats because of NONE mode + ManifestFile manifestFile = store.manifestFileFactory().create(); + DataFileMeta file = + manifestFile.read(manifest.fileName(), manifest.fileSize()).get(0).file(); + SimpleStats recordStats = file.valueStats(); + assertThat(file.valueStatsCols()).isEmpty(); + assertThat(recordStats.minValues().getFieldCount()).isEqualTo(0); + assertThat(recordStats.maxValues().getFieldCount()).isEqualTo(0); + assertThat(recordStats.nullCounts().size()).isEqualTo(0); + } } diff --git a/paimon-core/src/test/java/org/apache/paimon/table/AppendOnlyFileStoreTableTest.java b/paimon-core/src/test/java/org/apache/paimon/table/AppendOnlyFileStoreTableTest.java index 0328cc6bada3..922221bb8dc9 100644 --- a/paimon-core/src/test/java/org/apache/paimon/table/AppendOnlyFileStoreTableTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/table/AppendOnlyFileStoreTableTest.java @@ -230,7 +230,6 @@ public void testBatchProjection() throws Exception { public void testBatchFilter(boolean statsDenseStore) throws Exception { Consumer optionsSetter = options -> { - options.set(CoreOptions.METADATA_STATS_DENSE_STORE, statsDenseStore); if (statsDenseStore) { options.set(CoreOptions.METADATA_STATS_MODE, "none"); options.set("fields.b.stats-mode", "full"); diff --git a/paimon-core/src/test/java/org/apache/paimon/table/PrimaryKeyFileStoreTableTest.java b/paimon-core/src/test/java/org/apache/paimon/table/PrimaryKeyFileStoreTableTest.java index 51c8b328dfc6..46b85223bc2f 100644 --- a/paimon-core/src/test/java/org/apache/paimon/table/PrimaryKeyFileStoreTableTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/table/PrimaryKeyFileStoreTableTest.java @@ -348,7 +348,6 @@ public void testBatchProjection() throws Exception { public void testBatchFilter(boolean statsDenseStore) throws Exception { Consumer optionsSetter = options -> { - options.set(CoreOptions.METADATA_STATS_DENSE_STORE, statsDenseStore); if (statsDenseStore) { // pk table doesn't need value stats options.set(CoreOptions.METADATA_STATS_MODE, "none"); @@ -1664,7 +1663,6 @@ public void testReadDeletionVectorTable(boolean statsDenseStore) throws Exceptio options.set(TARGET_FILE_SIZE, new MemorySize(1)); options.set(DELETION_VECTORS_ENABLED, true); - options.set(CoreOptions.METADATA_STATS_DENSE_STORE, statsDenseStore); if (statsDenseStore) { options.set(CoreOptions.METADATA_STATS_MODE, "none"); options.set("fields.b.stats-mode", "full");