diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py index e3ab444..001b9b2 100644 --- a/scripts/generate_test_data.py +++ b/scripts/generate_test_data.py @@ -180,7 +180,7 @@ def generate_test_data_pyspark(name, current_path, input_path, delete_predicate ## Partitioned table with all types we can file skip on for type in ["bool", "int", "tinyint", "smallint", "bigint", "float", "double", "varchar"]: - query = f"CREATE table test_table as select i::{type} as value, i::{type} as part from range(0,2) tbl(i)" + query = f"CREATE table test_table as select i::{type} as value1, (i)::{type} as value2, (i)::{type} as value3, i::{type} as part from range(0,5) tbl(i)" generate_test_data_delta_rs(f"test_file_skipping/{type}", query, "part") ## Simple table with deletion vector diff --git a/src/delta_extension.cpp b/src/delta_extension.cpp index 36003a3..0c21ade 100644 --- a/src/delta_extension.cpp +++ b/src/delta_extension.cpp @@ -52,6 +52,11 @@ static void LoadInternal(DatabaseInstance &instance) { // Register the "single table" delta catalog (to ATTACH a single delta table) auto &config = DBConfig::GetConfig(instance); config.storage_extensions["delta"] = make_uniq(); + + config.AddExtensionOption("delta_scan_explain_files_filtered", + "Adds the filtered files to the explain output. Warning: this may change performance of " + "delta scan during explain analyze queries.", + LogicalType::BOOLEAN, Value(true)); } void DeltaExtension::Load(DuckDB &db) { diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index 377c5c0..fb4bbe4 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -15,6 +15,7 @@ #include "duckdb/parser/parsed_expression.hpp" #include "duckdb/planner/binder.hpp" #include "duckdb/planner/operator/logical_get.hpp" +#include "duckdb/main/query_profiler.hpp" #include #include @@ -523,12 +524,12 @@ unique_ptr DeltaSnapshot::ComplexFilterPushdown(ClientContext &co return nullptr; } - for (const auto &filter : filters) { - combiner.AddFilter(filter->Copy()); + for (auto riter = filters.rbegin(); riter != filters.rend(); ++riter) { + combiner.AddFilter(riter->get()->Copy()); } + auto filterstmp = combiner.GenerateTableScanFilters(info.column_indexes); - // TODO: can/should we figure out if this filtered anything? auto filtered_list = make_uniq(context, paths[0]); filtered_list->table_filters = std::move(filterstmp); filtered_list->names = names; @@ -536,6 +537,54 @@ unique_ptr DeltaSnapshot::ComplexFilterPushdown(ClientContext &co // Copy over the snapshot, this avoids reparsing metadata filtered_list->snapshot = snapshot; + auto &profiler = QueryProfiler::Get(context); + + // Note: this is potentially quite expensive: we are creating 2 scans of the snapshot and fully materializing both + // file lists Therefore this is only done when profile is enabled. This is enable by default in debug mode or for + // EXPLAIN ANALYZE queries + if (profiler.IsEnabled()) { + Value result; + if (!context.TryGetCurrentSetting("delta_scan_explain_files_filtered", result)) { + throw InternalException("Failed to find 'delta_scan_explain_files_filtered' option!"); + } else if (result.GetValue()) { + auto old_total = GetTotalFileCount(); + auto new_total = filtered_list->GetTotalFileCount(); + + if (old_total != new_total) { + string filters_info; + bool first_item = true; + for (auto &f : filtered_list->table_filters.filters) { + auto &column_index = f.first; + auto &filter = f.second; + if (column_index < names.size()) { + if (!first_item) { + filters_info += "\n"; + } + first_item = false; + auto &col_name = names[column_index]; + filters_info += filter->ToString(col_name); + } + } + + info.extra_info.file_filters = filters_info; + } + + if (!info.extra_info.total_files.IsValid()) { + info.extra_info.total_files = old_total; + } else if (info.extra_info.total_files.GetIndex() < old_total) { + throw InternalException( + "Error encountered when analyzing filtered out files for delta scan: total_files inconsistent!"); + } + + if (!info.extra_info.filtered_files.IsValid() || info.extra_info.filtered_files.GetIndex() >= new_total) { + info.extra_info.filtered_files = new_total; + } else { + throw InternalException( + "Error encountered when analyzing filtered out files for delta scan: filtered_files inconsistent!"); + } + } + } + return std::move(filtered_list); } diff --git a/test/sql/generated/file_skipping_all_types.test b/test/sql/generated/file_skipping_all_types.test index e4348e8..e2b90ea 100644 --- a/test/sql/generated/file_skipping_all_types.test +++ b/test/sql/generated/file_skipping_all_types.test @@ -8,37 +8,127 @@ require delta require-env GENERATED_DATA_AVAILABLE -# TODO: this doesn't appear to skip files yet -# TODO: add tests once https://github.com/duckdb/duckdb/pull/12488 is available +foreach type float double -query I -select value -from delta_scan('./data/generated/test_file_skipping/bool/delta_lake') -where part != false -order by value +# using column to skip files +query II +EXPLAIN ANALYZE SELECT value1, value2, value3 +FROM delta_scan('./data/generated/test_file_skipping/${type}/delta_lake') +WHERE + value1 > 0.5 and + value2 > 2.5 and + value3 < 3.5 ---- -true +analyzed_plan :.*File Filters:.*value1>0.5.*value2>2.5.*value3<3.5.*Scanning Files: 1/5.* -foreach type bool int tinyint smallint bigint varchar +query III +SELECT value1, value2, value3 +FROM delta_scan('./data/generated/test_file_skipping/${type}/delta_lake') +WHERE + value1 > 0.5 and + value2 > 2.5 and + value3 < 3.5 +---- +3.0 3.0 3.0 -query I -select value -from delta_scan('./data/generated/test_file_skipping/${type}/delta_lake') -where part != 0 -order by value +# FIXME: Partition columns currently don't cause file skipping yet +query II +EXPLAIN ANALYZE SELECT part +FROM delta_scan('./data/generated/test_file_skipping/${type}/delta_lake') +WHERE part > 0.5 ---- -1 +analyzed_plan :.*File Filters:.* endloop -foreach type float double +# use bool column to skip files +query II +EXPLAIN ANALYZE SELECT * +FROM delta_scan('./data/generated/test_file_skipping/bool/delta_lake') +WHERE value1=false +---- +analyzed_plan :.*File Filters:.*value1=false.*Scanning Files: 1/2.* + +# FIXME: Partition columns currently don't cause file skipping yet +query II +EXPLAIN ANALYZE SELECT part +FROM delta_scan('./data/generated/test_file_skipping/bool/delta_lake') +WHERE part=false +---- +analyzed_plan :.*File Filters:.* + +foreach type int tinyint smallint bigint + +# using column to skip files +query II +EXPLAIN ANALYZE SELECT value1, value2, value3 +FROM delta_scan('./data/generated/test_file_skipping/${type}/delta_lake') +WHERE + value1 > 1 and + value2 > 2 and + value3 < 4 +---- +analyzed_plan :.*File Filters:.*value1>1.*value2>2.*value3<4.*Scanning Files: 1/5.* + +query III +SELECT value1, value2, value3 +FROM delta_scan('./data/generated/test_file_skipping/${type}/delta_lake') +WHERE + value1 > 1 and + value2 > 2 and + value3 < 4 +---- +3 3 3 -query I -select value -from delta_scan('./data/generated/test_file_skipping/${type}/delta_lake') -where part > 0.5 -order by value +# FIXME: Partition columns currently don't cause file skipping yet +query II +EXPLAIN ANALYZE SELECT part +FROM delta_scan('./data/generated/test_file_skipping/${type}/delta_lake') +WHERE part = 0 ---- -1.0 +analyzed_plan :.*File Filters:.* endloop + +# using column to skip files +query II +EXPLAIN ANALYZE SELECT value1, value2, value3 +FROM delta_scan('./data/generated/test_file_skipping/varchar/delta_lake') +WHERE + value1 = '2' and + value2 = '2' and + value3 = '2' +---- +analyzed_plan :.*File Filters:.*value1='2'.*value2='2'.*value3='2'.*Scanning Files: 1/5.* + +query III +SELECT value1, value2, value3 +FROM delta_scan('./data/generated/test_file_skipping/varchar/delta_lake') +WHERE + value1 = '2' and + value2 = '2' and + value3 = '2' +---- +2 2 2 + +# FIXME: Partition columns currently don't cause file skipping yet +query II +EXPLAIN ANALYZE SELECT part +FROM delta_scan('./data/generated/test_file_skipping/varchar/delta_lake') +WHERE part = '0' +---- +analyzed_plan :.*File Filters:.* + +# We can remove this from output if precise operator timing is crucial +statement ok +set delta_scan_explain_files_filtered = false; + +query II +EXPLAIN ANALYZE SELECT value1, value2, value3 +FROM delta_scan('./data/generated/test_file_skipping/varchar/delta_lake') +WHERE + value1 = '2' and + value2 = '2' and + value3 = '2' +---- +analyzed_plan :.*File Filters:.* \ No newline at end of file