Skip to content

Commit

Permalink
Merge pull request #76 from samansmink/feature
Browse files Browse the repository at this point in the history
Update feature branch
  • Loading branch information
samansmink authored Aug 30, 2024
2 parents 0b98197 + bdd8f26 commit dfee8b3
Show file tree
Hide file tree
Showing 297 changed files with 2,639 additions and 164 deletions.
12 changes: 9 additions & 3 deletions .github/workflows/CloudTesting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ jobs:
with:
vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6

- name: Configure OpenSSL for Rust
run: |
echo "OPENSSL_ROOT_DIR=`pwd`/build/release/vcpkg_installed/x64-linux" >> $GITHUB_ENV
echo "OPENSSL_DIR=`pwd`/build/release/vcpkg_installed/x64-linux" >> $GITHUB_ENV
echo "OPENSSL_USE_STATIC_LIBS=true" >> $GITHUB_ENV
- name: Setup Rust
uses: dtolnay/rust-toolchain@stable

Expand All @@ -57,15 +63,15 @@ jobs:
AZURE_TENANT_ID: ${{secrets.AZURE_TENANT_ID}}
AZURE_STORAGE_ACCOUNT: ${{secrets.AZURE_STORAGE_ACCOUNT}}
run: |
python3 duckdb/scripts/run_tests_one_by_one.py ./build/release/test/unittest "*test/sql/cloud/*"
python3 duckdb/scripts/run_tests_one_by_one.py ./build/release/test/unittest `pwd`/test/sql/cloud/*
- name: Test with SPN logged in in azure-cli
env:
AZURE_STORAGE_ACCOUNT: ${{secrets.AZURE_STORAGE_ACCOUNT}}
DUCKDB_AZ_CLI_LOGGED_IN: 1
run: |
az login --service-principal -u ${{secrets.AZURE_CLIENT_ID}} -p ${{secrets.AZURE_CLIENT_SECRET}} --tenant ${{secrets.AZURE_TENANT_ID}}
python3 duckdb/scripts/run_tests_one_by_one.py ./build/release/test/unittest "*test/sql/cloud/*"
python3 duckdb/scripts/run_tests_one_by_one.py ./build/release/test/unittest `pwd`/test/sql/cloud/*
- name: Log out azure-cli
if: always()
Expand All @@ -77,4 +83,4 @@ jobs:
AZURE_STORAGE_ACCOUNT: ${{secrets.AZURE_STORAGE_ACCOUNT}}
DUCKDB_AZURE_PUBLIC_CONTAINER_AVAILABLE: 1
run: |
python3 duckdb/scripts/run_tests_one_by_one.py ./build/release/test/unittest "*test/sql/cloud/*"
python3 duckdb/scripts/run_tests_one_by_one.py ./build/release/test/unittest `pwd`/test/sql/cloud/*
12 changes: 12 additions & 0 deletions .github/workflows/LocalTesting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,12 @@ jobs:
with:
vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6

- name: Configure OpenSSL for Rust
run: |
echo "OPENSSL_ROOT_DIR=`pwd`/build/release/vcpkg_installed/x64-linux" >> $GITHUB_ENV
echo "OPENSSL_DIR=`pwd`/build/release/vcpkg_installed/x64-linux" >> $GITHUB_ENV
echo "OPENSSL_USE_STATIC_LIBS=true" >> $GITHUB_ENV
- name: Build
shell: bash
run: make
Expand Down Expand Up @@ -194,6 +200,12 @@ jobs:
with:
vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6

- name: Configure OpenSSL for Rust
run: |
echo "OPENSSL_ROOT_DIR=`pwd`/build/release/vcpkg_installed/x64-linux" >> $GITHUB_ENV
echo "OPENSSL_DIR=`pwd`/build/release/vcpkg_installed/x64-linux" >> $GITHUB_ENV
echo "OPENSSL_USE_STATIC_LIBS=true" >> $GITHUB_ENV
- name: Build
shell: bash
run: make generate-data
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
build
benchmark_results
duckdb_benchmark_data/
.idea
cmake-build-debug
duckdb_unittest_tempdir/
Expand Down
4 changes: 2 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
branch = main
[submodule "extension-ci-tools"]
path = extension-ci-tools
url = git@github.com:duckdb/extension-ci-tools.git
branch = main
url = https://github.com/duckdb/extension-ci-tools.git
branch = main
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ if(APPLE)
elseif(UNIX)
set(PLATFORM_LIBS m c resolv)
elseif(WIN32)
set(PLATFORM_LIBS ntdll ncrypt secur32 ws2_32 userenv bcrypt msvcrt advapi32)
set(PLATFORM_LIBS ntdll ncrypt secur32 ws2_32 userenv bcrypt msvcrt advapi32 RuntimeObject)
else()
message(STATUS "UNKNOWN OS")
endif()
Expand Down Expand Up @@ -99,7 +99,7 @@ ExternalProject_Add(
GIT_REPOSITORY "https://github.com/delta-incubator/delta-kernel-rs"
# WARNING: the FFI headers are currently pinned due to the C linkage issue of the c++ headers. Currently, when bumping
# the kernel version, the produced header in ./src/include/delta_kernel_ffi.hpp should be also bumped, applying the fix
GIT_TAG ed2b80b127984481adba8e59879f39b9e5f871d1
GIT_TAG v0.2.0
# Prints the env variables passed to the cargo build to the terminal, useful in debugging because passing them
# through CMake is an error-prone mess
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} env
Expand Down
11 changes: 11 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,20 @@ test_release: export DAT_PATH=./build/release/rust/src/delta_kernel/acceptance/t
test_debug: export DELTA_KERNEL_TESTS_PATH=./build/debug/rust/src/delta_kernel/kernel/tests/data
test_debug: export DAT_PATH=./build/debug/rust/src/delta_kernel/acceptance/tests/dat

# Core extensions that we need for testing
CORE_EXTENSIONS='tpcds;tpch;aws;azure;httpfs'

# Set this flag during building to enable the benchmark runner
ifeq (${BUILD_BENCHMARK}, 1)
TOOLCHAIN_FLAGS:=${TOOLCHAIN_FLAGS} -DBUILD_BENCHMARKS=1
endif

# Include the Makefile from extension-ci-tools
include extension-ci-tools/makefiles/duckdb_extension.Makefile

# Include the Makefile from the benchmark directory
include benchmark/benchmark.Makefile

# Generate some test data to test with
generate-data:
python3 -m pip install delta-spark duckdb pandas deltalake pyspark delta
Expand Down
56 changes: 45 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,40 +1,68 @@
# DuckDB Delta Extension

This is the experimental DuckDB extension for [Delta](https://delta.io/). It is built using the (also experimental)
[Delta Kernel](https://github.com/delta-incubator/delta-kernel-rs). The extension (currently) offers **read** support for delta
tables, both local and remote.

# Supported platforms
## Supported platforms

The supported platforms are:

- `linux_amd64` and `linux_amd64_gcc4` and `linux_arm64`
- `osx_amd64` and `osx_arm64`
- `windows_amd64`

Support for the [other](https://duckdb.org/docs/extensions/working_with_extensions#platforms) DuckDB platforms is
work-in-progress

# How to use
**NOTE: this extension requires the DuckDB v0.10.3 or higher**
## How to use

> [!NOTE]
> This extension requires DuckDB v0.10.3 or higher.

This extension is distributed as a binary extension. To use it, simply use one of its functions from DuckDB and the extension will be autoloaded:

```SQL
FROM delta_scan('s3://some/delta/table');
```

Note that using DuckDB [Secrets](https://duckdb.org/docs/configuration/secrets_manager.html) for S3 authentication is supported:
To scan a local table, use the full path prefixed with `file://`:

```SQL
FROM delta_scan('file:///some/path/on/local/machine');
```

## Cloud Storage authentication

Note that using DuckDB [Secrets](https://duckdb.org/docs/configuration/secrets_manager.html) for Cloud authentication is supported.

### S3 Example

```SQL
CREATE SECRET (TYPE S3, provider credential_chain);
CREATE SECRET (
TYPE S3,
PROVIDER CREDENTIAL_CHAIN
);
FROM delta_scan('s3://some/delta/table/with/auth');
```

To scan a local table, use the full path prefixes with `file://`
### Azure Example

```SQL
FROM delta_scan('file:///some/path/on/local/machine');
CREATE SECRET (
TYPE AZURE,
PROVIDER CREDENTIAL_CHAIN,
CHAIN 'cli',
ACCOUNT_NAME 'mystorageaccount'
);
FROM delta_scan('abfss://some/delta/table/with/auth');
```

# Features
## Features

While still experimental, many (scanning) features/optimizations are already supported in this extension as it reuses most of DuckDB's
regular parquet scanning logic:

- multithreaded scans and parquet metadata reading
- data skipping/filter pushdown
- skipping row-groups in file (based on parquet metadata)
Expand All @@ -43,24 +71,30 @@ regular parquet scanning logic:
- scanning tables with deletion vectors
- all primitive types
- structs
- S3 support with secrets
- Cloud storage (AWS, Azure, GCP) support with secrets

More features coming soon!

# Building
## Building

See the [Extension Template](https://github.com/duckdb/extension-template) for generic build instructions

# Running tests
## Running tests

There are various tests available for the delta extension:

1. Delta Acceptance Test (DAT) based tests in `/test/sql/dat`
2. delta-kernel-rs based tests in `/test/sql/delta_kernel_rs`
3. Generated data based tests in `/test/sql/generated` (generated using [delta-rs](https://delta-io.github.io/delta-rs/), [PySpark](https://spark.apache.org/docs/latest/api/python/index.html), and DuckDB)

To run the first 2 sets of tests:

```shell
make test_debug
```

or in release mode

```shell
make test
```
Expand Down
41 changes: 41 additions & 0 deletions benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Benchmarking the Delta Extension

## Basics
A primitive benchmarking suite exists for the Delta extension.

To run the benchmarks, first build the extension with the benchmark runner enabled:
```shell
BUILD_BENCHMARK=1 make
```

Then, make sure that the generated data is created using:
```shell
make generate-data
```

Then to run a benchmark, use one of the benchmark Makefile targets prefixed with `bench-run-`:
```shell
make bench-run-tpch-sf1
```
Now the TPCH benchmark will be run twice, once on parquet files and once on a delta table.

To create a plot from the results run:
```shell
make plot
```

## More options
Specific benchmarks can be run from a suite using the `BENCHMARK_PATTERN` variable. For example to compare
only Q01 from TPCH SF1, run:
```shell
BENCHMARK_PATTERN=q01.benchmark make bench-run-tpch-sf1
```

Also, we can run all local benchmarks using:
```shell
make bench-run-all-local
```
Or all remote benchmarks (these expect `BENCHMARK_DATA_S3_LINEITEM_SF1` to be set) using:
```shell
make bench-run-all-remote
```
70 changes: 70 additions & 0 deletions benchmark/benchmark.Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Benchmark targets for the Delta extension.
#
# Each bench-run-* target invokes ./build/release/benchmark/benchmark_runner on
# one benchmark directory and captures the runner's combined stdout and stderr
# in a CSV file inside benchmark_results/.
#
# BENCHMARK_PATTERN is appended to the benchmark directory to select which
# benchmark files run; it defaults to `.*` when unset or empty.

# None of these targets produce a file named after themselves, so declare all of
# them phony; otherwise a stray file with a matching name would silently
# disable the target.
.PHONY: bench-output-dir clean_benchmark plot \
        bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet bench-run-tpch-sf1-duckdb bench-run-tpch-sf1 \
        bench-run-tpch-sf1-remote-delta bench-run-tpch-sf1-remote-parquet bench-run-tpch-sf1-remote \
        bench-run-tpcds-sf1-delta bench-run-tpcds-sf1-parquet bench-run-tpcds-sf1-duckdb bench-run-tpcds-sf1 \
        bench-run-all-local bench-run-all-remote

# Appending a dummy character makes the comparison true exactly when
# BENCHMARK_PATTERN is unset or empty.
ifeq ("$(BENCHMARK_PATTERN)a", "a")
	BENCHMARK_PATTERN:=.*
endif

# Create the directory that receives the result CSV files
bench-output-dir:
	mkdir -p benchmark_results

# Remove all previously collected benchmark results
clean_benchmark:
	rm -rf benchmark_results

# Run the plotting script
plot:
	python3 scripts/plot.py


############### BENCHMARK TARGETS ###############

# NOTE: output is redirected with `> file 2>&1` rather than the bash-only `&>`.
# make runs recipes with /bin/sh by default, and POSIX shells such as dash parse
# `cmd &> file` as "run cmd in the background, then truncate file", which
# leaves the CSV empty.

###
# TPCH LOCAL
###

# TPCH SF1 on delta table
bench-run-tpch-sf1-delta: bench-output-dir
	./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta/$(BENCHMARK_PATTERN)' > benchmark_results/tpch-sf1-delta.csv 2>&1
# TPCH SF1 on parquet files
bench-run-tpch-sf1-parquet: bench-output-dir
	./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1-parquet/$(BENCHMARK_PATTERN)' > benchmark_results/tpch-sf1-parquet.csv 2>&1
# TPCH SF1 on duckdb file
bench-run-tpch-sf1-duckdb: bench-output-dir
	./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1/$(BENCHMARK_PATTERN)' > benchmark_results/tpch-sf1-duckdb.csv 2>&1
# COMPARES TPCH SF1 on parquet file vs on delta files
bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet

###
# TPCH REMOTE
###

# TPCH on remote delta table (set BENCHMARK_DATA_S3_LINEITEM_SF1)
bench-run-tpch-sf1-remote-delta: bench-output-dir
	./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta-remote/$(BENCHMARK_PATTERN)' > benchmark_results/tpch-sf1-remote-delta.csv 2>&1
# TPCH on remote parquet table (set BENCHMARK_DATA_S3_LINEITEM_SF1)
bench-run-tpch-sf1-remote-parquet: bench-output-dir
	./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-parquet-remote/$(BENCHMARK_PATTERN)' > benchmark_results/tpch-sf1-remote-parquet.csv 2>&1
# COMPARES TPCH SF1 on remote parquet files vs on remote delta table
bench-run-tpch-sf1-remote: bench-run-tpch-sf1-remote-parquet bench-run-tpch-sf1-remote-delta

###
# TPCDS LOCAL
###

# TPCDS SF1 on delta table
bench-run-tpcds-sf1-delta: bench-output-dir
	./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-delta/$(BENCHMARK_PATTERN)' > benchmark_results/tpcds-sf1-delta.csv 2>&1
# TPCDS SF1 on parquet files
bench-run-tpcds-sf1-parquet: bench-output-dir
	./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-parquet/$(BENCHMARK_PATTERN)' > benchmark_results/tpcds-sf1-parquet.csv 2>&1
# TPCDS SF1 on duckdb files
bench-run-tpcds-sf1-duckdb: bench-output-dir
	./build/release/benchmark/benchmark_runner 'benchmark/tpcds/sf1/$(BENCHMARK_PATTERN)' > benchmark_results/tpcds-sf1-duckdb.csv 2>&1

# COMPARES TPCDS SF1 on parquet file vs on delta files
bench-run-tpcds-sf1: bench-run-tpcds-sf1-delta bench-run-tpcds-sf1-parquet

###
# ALL
###

# Runs every local comparison suite (TPCDS and TPCH)
bench-run-all-local: bench-run-tpcds-sf1 bench-run-tpch-sf1

# Runs every remote comparison suite
bench-run-all-remote: bench-run-tpch-sf1-remote
24 changes: 24 additions & 0 deletions benchmark/tpcds/sf1-delta/load.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- Defines one view per TPC-DS table, each reading the Delta table stored under
-- ./data/generated/tpcds_sf1/<table>/delta_lake.
CREATE VIEW call_center AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/call_center/delta_lake');
CREATE VIEW catalog_page AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/catalog_page/delta_lake');
CREATE VIEW catalog_returns AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/catalog_returns/delta_lake');
CREATE VIEW catalog_sales AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/catalog_sales/delta_lake');
CREATE VIEW customer AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/customer/delta_lake');
CREATE VIEW customer_demographics AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/customer_demographics/delta_lake');
CREATE VIEW customer_address AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/customer_address/delta_lake');
CREATE VIEW date_dim AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/date_dim/delta_lake');
CREATE VIEW household_demographics AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/household_demographics/delta_lake');
CREATE VIEW inventory AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/inventory/delta_lake');
CREATE VIEW income_band AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/income_band/delta_lake');
CREATE VIEW item AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/item/delta_lake');
CREATE VIEW promotion AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/promotion/delta_lake');
CREATE VIEW reason AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/reason/delta_lake');
CREATE VIEW ship_mode AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/ship_mode/delta_lake');
CREATE VIEW store AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/store/delta_lake');
CREATE VIEW store_returns AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/store_returns/delta_lake');
CREATE VIEW store_sales AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/store_sales/delta_lake');
CREATE VIEW time_dim AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/time_dim/delta_lake');
CREATE VIEW warehouse AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/warehouse/delta_lake');
CREATE VIEW web_page AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/web_page/delta_lake');
CREATE VIEW web_returns AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/web_returns/delta_lake');
CREATE VIEW web_sales AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/web_sales/delta_lake');
CREATE VIEW web_site AS SELECT * FROM delta_scan('./data/generated/tpcds_sf1/web_site/delta_lake');
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1-delta/q01.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1-delta/q01.benchmark
# description: Run query 01 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1-delta/tpcds_sf1.benchmark.in
QUERY_NUMBER=1
QUERY_NUMBER_PADDED=01
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1-delta/q02.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1-delta/q02.benchmark
# description: Run query 02 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1-delta/tpcds_sf1.benchmark.in
QUERY_NUMBER=2
QUERY_NUMBER_PADDED=02
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1-delta/q03.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1-delta/q03.benchmark
# description: Run query 03 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1-delta/tpcds_sf1.benchmark.in
QUERY_NUMBER=3
QUERY_NUMBER_PADDED=03
Loading

0 comments on commit dfee8b3

Please sign in to comment.