diff --git a/.github/workflows/Rest.yml b/.github/workflows/Rest.yml
new file mode 100644
index 0000000..462fe9d
--- /dev/null
+++ b/.github/workflows/Rest.yml
@@ -0,0 +1,64 @@
+name: Rest
+on: [pull_request, repository_dispatch]
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }}
+  cancel-in-progress: true
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  rest:
+    name: Test against Rest Catalog
+    runs-on: ubuntu-latest
+    env:
+      VCPKG_TARGET_TRIPLET: 'x64-linux'
+      GEN: Ninja
+      VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake
+
+    steps:
+      - name: Install required Ubuntu packages
+        run: |
+          sudo apt-get update -y -qq
+          sudo apt-get install -y -qq software-properties-common
+          sudo add-apt-repository ppa:git-core/ppa
+          sudo apt-get update -y -qq
+          sudo apt-get install -y -qq ninja-build make gcc-multilib g++-multilib libssl-dev wget openjdk-8-jdk zip maven unixodbc-dev libc6-dev-i386 lib32readline6-dev libssl-dev libcurl4-gnutls-dev libexpat1-dev gettext unzip build-essential checkinstall libffi-dev curl libz-dev openssh-client
+          sudo apt-get install -y -qq tar pkg-config
+          sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
+          sudo chmod +x /usr/local/bin/docker-compose
+
+      - name: Install Git 2.18.5
+        run: |
+          wget https://github.com/git/git/archive/refs/tags/v2.18.5.tar.gz
+          tar xvf v2.18.5.tar.gz
+          cd git-2.18.5
+          make
+          sudo make prefix=/usr install
+          git --version
+
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          submodules: 'true'
+
+      - name: Setup ManyLinux2014
+        run: |
+          ./duckdb/scripts/setup_manylinux2014.sh general aws-cli ccache ssh openssl python_alias
+
+      - name: Setup vcpkg
+        uses: lukka/run-vcpkg@v11.1
+        with:
+          vcpkgGitCommitId: 501db0f17ef6df184fcdbfbe0f87cde2313b6ab1
+
+      - name: Build extension
+        env:
+          GEN: ninja
+          STATIC_LIBCPP: 1
+        run: |
+          make release
+
+      - name: Start Rest Catalog
+        working-directory: scripts/
+        run: |
+          ./start-rest-catalog.sh
diff --git a/.gitignore b/.gitignore
index aba8987..eccf73f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,7 @@ testext
 test/python/__pycache__/
 .Rhistory
 test/sql/tmp.test
-data/iceberg/generated_*
\ No newline at end of file
+data/iceberg/generated_*
+scripts/metastore_db/
+scripts/derby.log
+scripts/test-script-with-path.sql
\ No newline at end of file
diff --git a/scripts/docker-compose.yml b/scripts/docker-compose.yml
new file mode 100644
index 0000000..6a14e64
--- /dev/null
+++ b/scripts/docker-compose.yml
@@ -0,0 +1,52 @@
+version: "3"
+
+services:
+  rest:
+    image: tabulario/iceberg-rest
+    container_name: iceberg-rest
+    networks:
+      iceberg_net:
+    ports:
+      - 8181:8181
+    environment:
+      - AWS_ACCESS_KEY_ID=admin
+      - AWS_SECRET_ACCESS_KEY=password
+      - AWS_REGION=us-east-1
+      - CATALOG_WAREHOUSE=s3://warehouse/
+      - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
+      - CATALOG_S3_ENDPOINT=http://minio:9000
+  minio:
+    image: minio/minio
+    container_name: minio
+    environment:
+      - MINIO_ROOT_USER=admin
+      - MINIO_ROOT_PASSWORD=password
+      - MINIO_DOMAIN=minio
+    networks:
+      iceberg_net:
+        aliases:
+          - warehouse.minio
+    ports:
+      - 9001:9001
+      - 9000:9000
+    command: ["server", "/data", "--console-address", ":9001"]
+  mc:
+    depends_on:
+      - minio
+    image: minio/mc
+    container_name: mc
+    networks:
+      iceberg_net:
+    environment:
+      - AWS_ACCESS_KEY_ID=admin
+      - AWS_SECRET_ACCESS_KEY=password
+      - AWS_REGION=us-east-1
+    entrypoint: >
+      /bin/sh -c "
+      until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
+      /usr/bin/mc rm -r --force minio/warehouse;
+      /usr/bin/mc mb minio/warehouse;
+      /usr/bin/mc policy set public minio/warehouse;
+      "
+networks:
+  iceberg_net:
\ No newline at end of file
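A quick way to confirm the compose stack above is healthy before provisioning is to hit the catalog's HTTP API directly. Below is a minimal sketch (not part of the patch), assuming the services are up on localhost with the port mappings from the compose file; the `/v1/config` and `/v1/namespaces` routes come from the Iceberg REST catalog spec:

```python
# Health check for the REST catalog defined in docker-compose.yml above.
# Assumes port 8181 (catalog) is reachable on localhost, as mapped above.
import json
import urllib.request


def get(path: str) -> dict:
    # /v1/config and /v1/namespaces are standard Iceberg REST catalog routes.
    with urllib.request.urlopen(f"http://127.0.0.1:8181{path}") as resp:
        return json.load(resp)


print(get("/v1/config"))      # catalog defaults and overrides
print(get("/v1/namespaces"))  # should list 'default' once provision.py has run
```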
diff --git a/scripts/provision.py b/scripts/provision.py
new file mode 100644
index 0000000..a71883b
--- /dev/null
+++ b/scripts/provision.py
@@ -0,0 +1,153 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyspark.sql import SparkSession
+
+import os
+
+os.environ[
+    "PYSPARK_SUBMIT_ARGS"
+] = "--packages org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.4.2,org.apache.iceberg:iceberg-aws-bundle:1.4.2 pyspark-shell"
+os.environ["AWS_REGION"] = "us-east-1"
+os.environ["AWS_ACCESS_KEY_ID"] = "admin"
+os.environ["AWS_SECRET_ACCESS_KEY"] = "password"
+
+spark = (
+    SparkSession.builder.appName("DuckDB REST Integration test")
+    .config(
+        "spark.sql.extensions",
+        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+    )
+    .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
+    .config("spark.sql.catalog.demo.type", "rest")
+    .config("spark.sql.catalog.demo.uri", "http://127.0.0.1:8181")
+    .config("spark.sql.catalog.demo.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
+    .config("spark.sql.catalog.demo.warehouse", "s3://warehouse/wh/")
+    .config("spark.sql.catalog.demo.s3.endpoint", "http://127.0.0.1:9000")
+    .config("spark.sql.catalog.demo.s3.path-style-access", "true")
+    .config("spark.sql.defaultCatalog", "demo")
+    .config("spark.sql.catalogImplementation", "in-memory")
+    .getOrCreate()
+)
+
+spark.sql(
+    """
+    CREATE DATABASE IF NOT EXISTS default;
+"""
+)
+
+spark.sql(
+    """
+CREATE OR REPLACE TABLE default.table_unpartitioned (
+  dt date,
+  number integer,
+  letter string
+)
+USING iceberg
+"""
+)
+
+spark.sql(
+    """
+    INSERT INTO default.table_unpartitioned
+    VALUES
+        (CAST('2023-03-01' AS date), 1, 'a'),
+        (CAST('2023-03-02' AS date), 2, 'b'),
+        (CAST('2023-03-03' AS date), 3, 'c'),
+        (CAST('2023-03-04' AS date), 4, 'd'),
+        (CAST('2023-03-05' AS date), 5, 'e'),
+        (CAST('2023-03-06' AS date), 6, 'f'),
+        (CAST('2023-03-07' AS date), 7, 'g'),
+        (CAST('2023-03-08' AS date), 8, 'h'),
+        (CAST('2023-03-09' AS date), 9, 'i'),
+        (CAST('2023-03-10' AS date), 10, 'j'),
+        (CAST('2023-03-11' AS date), 11, 'k'),
+        (CAST('2023-03-12' AS date), 12, 'l');
+    """
+)
+
+
+spark.sql(
+    """
+CREATE OR REPLACE TABLE default.table_partitioned (
+  dt date,
+  number integer,
+  letter string
+)
+USING iceberg
+PARTITIONED BY (days(dt))
+"""
+)
+
+spark.sql(
+    """
+    INSERT INTO default.table_partitioned
+    VALUES
+        (CAST('2023-03-01' AS date), 1, 'a'),
+        (CAST('2023-03-02' AS date), 2, 'b'),
+        (CAST('2023-03-03' AS date), 3, 'c'),
+        (CAST('2023-03-04' AS date), 4, 'd'),
+        (CAST('2023-03-05' AS date), 5, 'e'),
+        (CAST('2023-03-06' AS date), 6, 'f'),
+        (CAST('2023-03-07' AS date), 7, 'g'),
+        (CAST('2023-03-08' AS date), 8, 'h'),
+        (CAST('2023-03-09' AS date), 9, 'i'),
+        (CAST('2023-03-10' AS date), 10, 'j'),
+        (CAST('2023-03-11' AS date), 11, 'k'),
+        (CAST('2023-03-12' AS date), 12, 'l');
+    """
+)
+
+# By default, Spark uses copy-on-write deletes,
+# which optimize for read performance; this table opts into merge-on-read.
+
+spark.sql(
+    """
+CREATE OR REPLACE TABLE default.table_mor_deletes (
+  dt date,
+  number integer,
+  letter string
+)
+USING iceberg
+TBLPROPERTIES (
+  'write.delete.mode'='merge-on-read',
+  'write.update.mode'='merge-on-read',
+  'write.merge.mode'='merge-on-read',
+  'format-version'='2'
+);
+"""
+)
+
+
+spark.sql(
+    """
+    INSERT INTO default.table_mor_deletes
+    VALUES
+        (CAST('2023-03-01' AS date), 1, 'a'),
+        (CAST('2023-03-02' AS date), 2, 'b'),
+        (CAST('2023-03-03' AS date), 3, 'c'),
+        (CAST('2023-03-04' AS date), 4, 'd'),
+        (CAST('2023-03-05' AS date), 5, 'e'),
+        (CAST('2023-03-06' AS date), 6, 'f'),
+        (CAST('2023-03-07' AS date), 7, 'g'),
+        (CAST('2023-03-08' AS date), 8, 'h'),
+        (CAST('2023-03-09' AS date), 9, 'i'),
+        (CAST('2023-03-10' AS date), 10, 'j'),
+        (CAST('2023-03-11' AS date), 11, 'k'),
+        (CAST('2023-03-12' AS date), 12, 'l');
+    """
+)
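If appended to provision.py, a short assertion block can catch a silent provisioning failure before the DuckDB side runs. A hypothetical sketch, reusing the `spark` session configured above; the 12-row expectation matches the INSERT statements in the script:

```python
# Hypothetical sanity check, to be appended to provision.py: each of the
# three tables created above should contain exactly 12 rows.
for table in ("table_unpartitioned", "table_partitioned", "table_mor_deletes"):
    count = spark.sql(f"SELECT COUNT(*) AS c FROM default.{table}").collect()[0]["c"]
    assert count == 12, f"{table}: expected 12 rows, got {count}"
    print(f"{table}: OK ({count} rows)")
```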
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
new file mode 100644
index 0000000..67f2796
--- /dev/null
+++ b/scripts/requirements.txt
@@ -0,0 +1 @@
+pyspark==3.4.1
\ No newline at end of file
diff --git a/scripts/start-rest-catalog.sh b/scripts/start-rest-catalog.sh
new file mode 100755
index 0000000..82854ec
--- /dev/null
+++ b/scripts/start-rest-catalog.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+set -ex
+
+docker-compose kill
+docker-compose rm -f
+docker-compose up -d
+docker-compose logs -f mc
+
+pip3 install -r requirements.txt
+
+python3 provision.py
+
+# Would be nice to have REST support in there :)
+UNPARTITIONED_TABLE_PATH=$(curl -s http://127.0.0.1:8181/v1/namespaces/default/tables/table_unpartitioned | jq -r '."metadata-location"')
+
+SQL=$(cat <<-END
+INSTALL iceberg;
+LOAD iceberg;
+
+SET s3_access_key_id='admin';
+SET s3_secret_access_key='password';
+SET s3_endpoint='127.0.0.1:9000';
+SET s3_url_style='path';
+SET s3_use_ssl=false;
+
+SELECT * FROM iceberg_scan('${UNPARTITIONED_TABLE_PATH}');
+END
+
+)
+
+if test -f "../build/release/duckdb"
+then
+  # in CI
+  ../build/release/duckdb -s "$SQL"
+else
+  duckdb -s "$SQL"
+fi
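For local iteration, the same smoke test can be driven from Python instead of the shell. A sketch assuming the `duckdb` Python package and the built iceberg extension are available; the endpoint, credentials, and table name mirror the docker-compose setup and the script above:

```python
# Python equivalent of the smoke test in start-rest-catalog.sh (a sketch;
# endpoint, credentials, and table name mirror the docker-compose setup).
import json
import urllib.request

import duckdb

# Fetch the current metadata location from the REST catalog, as the script
# does with curl + jq.
url = "http://127.0.0.1:8181/v1/namespaces/default/tables/table_unpartitioned"
with urllib.request.urlopen(url) as resp:
    metadata_location = json.load(resp)["metadata-location"]

con = duckdb.connect()
con.execute("INSTALL iceberg;")
con.execute("LOAD iceberg;")
con.execute("SET s3_access_key_id='admin';")
con.execute("SET s3_secret_access_key='password';")
con.execute("SET s3_endpoint='127.0.0.1:9000';")
con.execute("SET s3_url_style='path';")
con.execute("SET s3_use_ssl=false;")
print(con.execute(f"SELECT * FROM iceberg_scan('{metadata_location}')").fetchall())
```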