Skip to content

Commit

Permalink
Merge pull request #77 from samansmink/add-stats
Browse files Browse the repository at this point in the history
Add stats
  • Loading branch information
samansmink authored Sep 2, 2024
2 parents dfee8b3 + 8e4f623 commit 3933ebd
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 17 deletions.
19 changes: 6 additions & 13 deletions benchmark/benchmark.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,10 @@ plot:

# TPCH SF1 on delta table
bench-run-tpch-sf1-delta: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-delta.csv
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-delta.csv
# TPCH SF1 on parquet files
bench-run-tpch-sf1-parquet: bench-output-dir
./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1-parquet/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-parquet.csv
# TPCH SF1 on duckdb file
bench-run-tpch-sf1-duckdb: bench-output-dir
./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-duckdb.csv
./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1-parquet/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-parquet.csv
# COMPARES TPCH SF1 on parquet file vs on delta files
bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet

Expand All @@ -38,10 +35,10 @@ bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet

# TPCH on remote delta table (set BENCHMARK_DATA_S3_LINEITEM_SF1)
bench-run-tpch-sf1-remote-delta: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta-remote/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-remote-delta.csv
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta-remote/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-remote-delta.csv
# TPCH on remote parquet table (set BENCHMARK_DATA_S3_LINEITEM_SF1)
bench-run-tpch-sf1-remote-parquet: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-parquet-remote/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-remote-parquet.csv
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-parquet-remote/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-remote-parquet.csv
# COMPARES TPCH SF1 on parquet file vs on delta files
bench-run-tpch-sf1-remote: bench-run-tpch-sf1-remote-parquet bench-run-tpch-sf1-remote-delta

Expand All @@ -51,14 +48,10 @@ bench-run-tpch-sf1-remote: bench-run-tpch-sf1-remote-parquet bench-run-tpch-sf1-

# TPCDS SF1 on delta table
bench-run-tpcds-sf1-delta: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-delta/$(BENCHMARK_PATTERN)' &> benchmark_results/tpcds-sf1-delta.csv
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-delta/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-delta.csv
# TPCDS SF1 on parquet files
bench-run-tpcds-sf1-parquet: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-parquet/$(BENCHMARK_PATTERN)' &> benchmark_results/tpcds-sf1-parquet.csv
# TPCDS SF1 on duckdb files
bench-run-tpcds-sf1-duckdb: bench-output-dir
./build/release/benchmark/benchmark_runner 'benchmark/tpcds/sf1/$(BENCHMARK_PATTERN)' &> benchmark_results/tpcds-sf1-duckdb.csv

./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-parquet/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-parquet.csv
# COMPARES TPCDS SF1 on parquet file vs on delta files
bench-run-tpcds-sf1: bench-run-tpcds-sf1-delta bench-run-tpcds-sf1-parquet

Expand Down
17 changes: 14 additions & 3 deletions scripts/plot.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
import duckdb
import argparse

### Parse script parameters
parser = argparse.ArgumentParser(description='Plot the results in ./benchmark_results')
parser.add_argument('-p','--pattern', help='Pattern to match result csv files to', required=False, default='*.csv')
parser.add_argument('-w','--width', help='Width of graph, adjust to fit data', required=False, default=20)
args = vars(parser.parse_args())

### Parse Query Results
parse_benchmark_result_query = """
parse_benchmark_result_query = f"""
SELECT
parse_filename(name, true) as benchmark,
parse_filename(filename, true) as config,
avg(timing) as timing
FROM
read_csv('benchmark_results/*.csv', filename=1)
read_csv('benchmark_results/{args['pattern']}', filename=1, columns = {{
'name': 'VARCHAR',
'run': 'BIGINT',
'timing': 'double'
}})
GROUP BY
config,
benchmark
Expand All @@ -22,6 +33,6 @@
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams["figure.figsize"] = [10, 5]
plt.rcParams["figure.figsize"] = [int(args['width']), 5]
fig = benchmark_results.pivot(index='benchmark', columns='config', values='timing').plot(kind='bar', title='', ylabel='runtime [s]').get_figure()
fig.savefig('benchmark_results/result.png')
19 changes: 18 additions & 1 deletion src/functions/delta_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ string url_decode(string input) {
return result;
}

static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats *, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) {
static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats * stats, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) {
auto context = (DeltaSnapshot *) engine_context;
auto path_string = context->GetPath();
StringUtil::RTrim(path_string, "/");
Expand All @@ -63,6 +63,7 @@ static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::Kernel
// Initialize the file metadata
context->metadata.back()->delta_snapshot_version = context->version;
context->metadata.back()->file_number = context->resolved_files.size() - 1;
context->metadata.back()->cardinality = stats->num_records;

// Fetch the deletion vector
auto selection_vector_res = ffi::selection_vector_from_dv(dv_info, context->extern_engine.get(), context->global_state.get());
Expand Down Expand Up @@ -493,6 +494,22 @@ idx_t DeltaSnapshot::GetTotalFileCount() {
return resolved_files.size();
}

unique_ptr<NodeStatistics> DeltaSnapshot::GetCardinality(ClientContext &context) {
// This also ensures all files are expanded
auto total_file_count = DeltaSnapshot::GetTotalFileCount();

if (total_file_count == 0) {
return make_uniq<NodeStatistics>(0,0);
}

idx_t total_tuple_count = 0;
for (auto &metadatum : metadata) {
total_tuple_count += metadatum->cardinality;
}

return make_uniq<NodeStatistics>(total_tuple_count,total_tuple_count);
}

unique_ptr<MultiFileReader> DeltaMultiFileReader::CreateInstance() {
return std::move(make_uniq<DeltaMultiFileReader>());
}
Expand Down
3 changes: 3 additions & 0 deletions src/include/functions/delta_scan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ struct DeltaFileMetaData {

idx_t delta_snapshot_version = DConstants::INVALID_INDEX;
idx_t file_number = DConstants::INVALID_INDEX;
idx_t cardinality = DConstants::INVALID_INDEX;
ffi::KernelBoolSlice selection_vector = {nullptr, 0};
case_insensitive_map_t<string> partition_map;
};
Expand All @@ -49,6 +50,8 @@ struct DeltaSnapshot : public MultiFileList {
FileExpandResult GetExpandResult() override;
idx_t GetTotalFileCount() override;

unique_ptr<NodeStatistics> GetCardinality(ClientContext &context) override;

protected:
//! Get the i-th expanded file
string GetFile(idx_t i) override;
Expand Down
19 changes: 19 additions & 0 deletions test/sql/dat/basic_append.test
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
----
5

# Cardinality estimation should correctly show this
query II
EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
----
physical_plan <REGEX>:.*5 Rows.*

query I
SELECT count(number)
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
Expand Down Expand Up @@ -78,3 +84,16 @@ SELECT a_float, number, letter
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
WHERE number > 6
----

# Filters are reflected in cardinality estimation: filtering out all files shows 0 EC
query II
EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
WHERE number > 6
----
physical_plan <REGEX>:.*0 Rows.*

query II
EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
WHERE number > 4
----
physical_plan <REGEX>:.*1 Rows.*
5 changes: 5 additions & 0 deletions test/sql/delta_kernel_rs/basic_partitioned.test
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,8 @@ e 5 5.5
a 1 1.1
b 2 2.2
c 3 3.3

query II
EXPLAIN FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/basic_partitioned')
----
physical_plan <REGEX>:.*6 Rows.*

0 comments on commit 3933ebd

Please sign in to comment.