Support reading s3 config from config file #36

Closed
wants to merge 25 commits into from
b26a0f1
add gitignore file
aykut-bozkurt Sep 23, 2024
03cfa23
add Makefile
aykut-bozkurt Sep 23, 2024
6154d25
declare cargo dependencies
aykut-bozkurt Sep 23, 2024
f20fcd5
add vscode settings file
aykut-bozkurt Sep 23, 2024
7c4ac3a
add ci workflow file
aykut-bozkurt Sep 23, 2024
d20e021
update extension control file comment
aykut-bozkurt Sep 23, 2024
61f4ad4
add pgrx utils module
aykut-bozkurt Sep 23, 2024
9b40920
add PG function declarations that are not generated by pgrx yet
aykut-bozkurt Sep 23, 2024
157b1e6
adds type conversions between PG types and arrow types
aykut-bozkurt Sep 23, 2024
4e7faf8
add postgis geometry type, encoded as WKB, conversion from/to arrow
aykut-bozkurt Sep 23, 2024
1315215
add crunchy_map type conversion from/to arrow
aykut-bozkurt Sep 23, 2024
5521f97
add fallback mechanism to convert unsupported PG types as arrow text …
aykut-bozkurt Sep 23, 2024
81bc0d5
parse arrow and parquet schema from PG table's tuple descriptor
aykut-bozkurt Sep 23, 2024
f168646
convert Postgres rows to arrow array
aykut-bozkurt Sep 23, 2024
e8319e5
write arrow array to parquet file
aykut-bozkurt Sep 23, 2024
047ff2e
convert arrow array to Postgres rows
aykut-bozkurt Sep 23, 2024
8109012
read arrow array from parquet file
aykut-bozkurt Sep 23, 2024
3f87cc1
add copy hook utils module
aykut-bozkurt Sep 23, 2024
762cf82
add COPY TO parquet api, which internally executes COPY TO command vi…
aykut-bozkurt Sep 23, 2024
fb0a773
adds COPY FROM parquet api, which internally uses Postgres binary pro…
aykut-bozkurt Sep 23, 2024
f4ce50d
- registers COPY TO/FROM parquet hook,
aykut-bozkurt Sep 23, 2024
ef043de
- adds "parquet.schema" udf to inspect the schema of a parquet file,
aykut-bozkurt Sep 23, 2024
607f71d
add Postgres tests
aykut-bozkurt Sep 23, 2024
7446ed6
adds README.md
aykut-bozkurt Sep 23, 2024
598941e
support reading s3 config from config file
aykut-bozkurt Sep 24, 2024
3 changes: 3 additions & 0 deletions .cargo/config.toml
@@ -1,3 +1,6 @@
[target.'cfg(target_os="macos")']
# Postgres symbols won't be available until runtime
rustflags = ["-Clink-arg=-Wl,-undefined,dynamic_lookup"]

[net]
git-fetch-with-cli = true
5 changes: 5 additions & 0 deletions .env_sample
@@ -0,0 +1,5 @@
AWS_S3_TEST_BUCKET=testbucket
AWS_REGION=us-east-1
AWS_ACCESS_KEY_ID=admin
AWS_SECRET_ACCESS_KEY=admin123
PG_PARQUET_TEST=true
110 changes: 110 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,110 @@
name: CI lints and tests
on:
  push:
    branches:
      - "*"

concurrency:
  group: ${{ github.ref }}
  cancel-in-progress: true

env:
  AWS_ACCESS_KEY_ID: test_access_key_id
  AWS_SECRET_ACCESS_KEY: test_secret_access_key
  AWS_REGION: us-east-1
  AWS_S3_TEST_BUCKET: testbucket
  PG_PARQUET_TEST: true

jobs:
  build-and-test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Rust
        uses: dtolnay/rust-toolchain@stable
        with:
          toolchain: 1.81.0
          target: x86_64-unknown-linux-gnu
          components: rustfmt, clippy

      - name: Install cargo-llvm-cov for coverage report
        run: cargo install --locked [email protected]

      - name: Install PostgreSQL
        run: |
          sudo sh -c 'echo "deb https://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list'
          wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
          sudo apt-get update
          sudo apt-get install build-essential libreadline-dev zlib1g-dev flex bison libxml2-dev libxslt-dev libssl-dev libxml2-utils xsltproc ccache pkg-config
          sudo apt-get -y install postgresql-16-postgis-3 libpq-dev postgresql-server-dev-16 postgresql-client-16

      - name: Install MinIO
        run: |
          # Download and install MinIO server and client
          wget https://dl.min.io/server/minio/release/linux-amd64/$MINIO_VERSION
          chmod +x $MINIO_VERSION
          mv $MINIO_VERSION /usr/local/bin/minio
          echo "$MINIO_SHA256 /usr/local/bin/minio" | sha256sum --check

          # Download and install MinIO admin
          wget https://dl.min.io/client/mc/release/linux-amd64/$MINIO_ADMIN_VERSION
          chmod +x $MINIO_ADMIN_VERSION
          mv $MINIO_ADMIN_VERSION /usr/local/bin/mc
          echo "$MINIO_ADMIN_SHA256 /usr/local/bin/mc" | sha256sum --check
        env:
          MINIO_VERSION: "minio.RELEASE.2024-09-22T00-33-43Z"
          MINIO_SHA256: "dea08573980057d84c14d5c55926e10b91fb2993a99696ff136fb0bddaa7c98f"
          MINIO_ADMIN_VERSION: "mc.RELEASE.2024-09-16T17-43-14Z"
          MINIO_ADMIN_SHA256: "9a9e7d32c175f2804d6880d5ad3623097ea439f0e0304aa6039874d0f0c493d8"

      - name: Install and configure pgrx
        run: |
          cargo install --locked [email protected]
          cargo pgrx init --pg16 $(which pg_config)

      - name: Format and lint
        run: |
          cargo fmt --all -- --check
          cargo clippy --all-targets --all-features -- -D warnings

      # pgrx tests with the runas argument ignore environment variables,
      # so we need to create a .env file beforehand
      - name: Create .env file
        run: |
          touch /tmp/.env
          echo AWS_ACCESS_KEY_ID=${{ env.AWS_ACCESS_KEY_ID }} >> /tmp/.env
          echo AWS_SECRET_ACCESS_KEY=${{ env.AWS_SECRET_ACCESS_KEY }} >> /tmp/.env
          echo AWS_REGION=${{ env.AWS_REGION }} >> /tmp/.env
          echo AWS_S3_TEST_BUCKET=${{ env.AWS_S3_TEST_BUCKET }} >> /tmp/.env
          echo PG_PARQUET_TEST=${{ env.PG_PARQUET_TEST }} >> /tmp/.env
**Collaborator:** would be kind of nice to write an actual `.aws/credentials` to confirm this PR works

**Collaborator (author):** we unset env vars in the s3 tests while reading from config.

      - name: Run tests
        run: |
          # Start MinIO server
          export MINIO_ROOT_USER=${{ env.AWS_ACCESS_KEY_ID }}
          export MINIO_ROOT_PASSWORD=${{ env.AWS_SECRET_ACCESS_KEY }}
          minio server /tmp/minio-storage > /dev/null 2>&1 &

          # Set access key and create test bucket
          mc alias set local http://localhost:9000 ${{ env.AWS_ACCESS_KEY_ID }} ${{ env.AWS_SECRET_ACCESS_KEY }}
          aws --endpoint-url http://localhost:9000 s3 mb s3://${{ env.AWS_S3_TEST_BUCKET }}

          # Run tests with coverage tool
          cargo llvm-cov test --lcov --output-path lcov.info

          # Stop MinIO server
          pkill -9 minio
        env:
          RUST_TEST_THREADS: 1
          CARGO_PGRX_TEST_RUNAS: postgres
          CARGO_PGRX_TEST_PGDATA: /tmp/pgdata

      - name: Upload coverage report to Codecov
        uses: codecov/codecov-action@v4
        with:
          fail_ci_if_error: false
          files: ./lcov.info
          flags: pgrxtests
          token: ${{ secrets.CODECOV_TOKEN }}
11 changes: 11 additions & 0 deletions .gitignore
@@ -4,3 +4,14 @@
*.iml
**/*.rs.bk
Cargo.lock
*.parquet
**/*.code-workspace
*.csv
*.profraw
*.profdata
*.gcov
*.lcov
*.xml
lcov.info
.env
playground.rs
5 changes: 4 additions & 1 deletion .vscode/settings.json
@@ -1,4 +1,7 @@
{
"editor.formatOnSave": true,
"editor.defaultFormatter": "rust-lang.rust-analyzer"
"editor.defaultFormatter": "rust-lang.rust-analyzer",
"rust-analyzer.check.command": "clippy",
"rust-analyzer.checkOnSave": true,
"editor.inlayHints.enabled": "offUnlessPressed",
}
34 changes: 22 additions & 12 deletions Cargo.toml
@@ -1,35 +1,45 @@
[package]
name = "pg_parquet"
version = "0.0.0"
version = "0.1.0"
edition = "2021"

[lib]
crate-type = ["cdylib"]
crate-type = ["cdylib","lib"]

[[bin]]
name = "pgrx_embed_pg_parquet"
path = "./src/bin/pgrx_embed.rs"

[features]
default = ["pg16"]
# pg11 = ["pgrx/pg11", "pgrx-tests/pg11" ]
# pg12 = ["pgrx/pg12", "pgrx-tests/pg12" ]
# pg13 = ["pgrx/pg13", "pgrx-tests/pg13" ]
# pg14 = ["pgrx/pg14", "pgrx-tests/pg14" ]
# pg15 = ["pgrx/pg15", "pgrx-tests/pg15" ]
pg16 = ["pgrx/pg16", "pgrx-tests/pg16" ]
pg_test = []

[dependencies]
parquet = { version = "52.0.0", features = [
arrow = {version = "53", default-features = false}
arrow-schema = {version = "53", default-features = false}
aws-config = { version = "1.5", default-features = false, features = ["rustls"]}
aws-credential-types = {version = "1.2", default-features = false}
dotenvy = "0.15"
futures = "0.3"
object_store = {version = "0.11", default-features = false, features = ["aws"]}
once_cell = "1"
parquet = {version = "53", default-features = false, features = [
"arrow",
"snap",
"brotli",
"flate2",
"lz4",
"zstd",
"base64"
"object_store",
]}
parquet_derive = "52.0.0"
pgrx = "=0.11.4"
pgrx = "=0.12.4"
serde = {version = "1", default-features = false}
serde_json = "1"
tokio = {version = "1", default-features = false, features = ["rt", "time", "macros"]}

[dev-dependencies]
pgrx-tests = "=0.11.4"
pgrx-tests = "=0.12.4"

[profile.dev]
panic = "unwind"
27 changes: 27 additions & 0 deletions Makefile
@@ -0,0 +1,27 @@
.PHONY: cargo-exists cargo-pgrx-exists pg_parquet install clean check uninstall

PG_CONFIG ?= pg_config

all: pg_parquet

cargo-exists:
	@which cargo > /dev/null || (echo "cargo is not available. Please install cargo." && exit 1)

cargo-pgrx-exists: cargo-exists
	@which cargo-pgrx > /dev/null || (echo "cargo-pgrx is not available. Please install cargo-pgrx." && exit 1)

pg_parquet: cargo-pgrx-exists
	cargo build --release --features pg16

install: pg_parquet
	cargo pgrx install --release --features pg16

clean: cargo-exists
	cargo clean

check: cargo-pgrx-exists
	RUST_TEST_THREADS=1 cargo pgrx test pg16

uninstall:
	rm -f $(shell $(PG_CONFIG) --pkglibdir)/pg_parquet.so
	rm -f $(shell $(PG_CONFIG) --sharedir)/extension/pg_parquet*
158 changes: 158 additions & 0 deletions README.md
@@ -0,0 +1,158 @@
# pg_parquet

> Copy from/to Parquet files in PostgreSQL!

[![CI lints and tests](https://github.com/aykut-bozkurt/pg_parquet/actions/workflows/ci.yml/badge.svg)](https://github.com/aykut-bozkurt/pg_parquet/actions/workflows/ci.yml)
[![codecov](https://codecov.io/gh/aykut-bozkurt/pg_parquet/graph/badge.svg?token=SVDGPEAP51)](https://codecov.io/gh/aykut-bozkurt/pg_parquet)

`pg_parquet` is a PostgreSQL extension that allows you to read and write Parquet files, located in `S3` or the local `file system`, from PostgreSQL via the `COPY TO/FROM` commands. It relies heavily on the [Apache Arrow](https://arrow.apache.org/rust/arrow/) project to read and write Parquet files and on the [pgrx](https://github.com/pgcentralfoundation/pgrx) project to extend PostgreSQL's `COPY` command.

## Quick Reference
- [Installation From Source](#installation-from-source)
- [Usage](#usage)
- [Copy FROM/TO Parquet files TO/FROM Postgres tables](#copy-tofrom-parquet-files-fromto-postgres-tables)
- [Inspect Parquet schema](#inspect-parquet-schema)
- [Inspect Parquet metadata](#inspect-parquet-metadata)
- [Object Store Support](#object-store-support)
- [Copy Options](#copy-options)
- [Configuration](#configuration)
- [Supported Types](#supported-types)
- [Nested Types](#nested-types)

## Installation From Source
After installing `Postgres`, you need to set up `rustup` and `cargo-pgrx` to build the extension.

```bash
# install rustup
> curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

# install cargo-pgrx
> cargo install cargo-pgrx

# configure pgrx
> cargo pgrx init --pg16 $(which pg_config)

# append the extension to shared_preload_libraries in ~/.pgrx/data-16/postgresql.conf
> echo "shared_preload_libraries = 'pg_parquet'" >> ~/.pgrx/data-16/postgresql.conf

# run cargo-pgrx to build and install the extension
> cargo pgrx run

# create the extension in the database
psql -c "CREATE EXTENSION pg_parquet;"
```

## Usage
There are three main things that you can do with `pg_parquet`:
1. You can export Postgres tables/queries to Parquet files,
2. You can ingest data from Parquet files to Postgres tables,
3. You can inspect the schema and metadata of Parquet files.

### COPY to/from Parquet files from/to Postgres tables
You can use PostgreSQL's `COPY` command to read and write Parquet files. Below is an example of how to write a PostgreSQL table, with complex types, into a Parquet file and then read the Parquet file content back into the same table.

```sql
-- create composite types
CREATE TYPE product_item AS (id INT, name TEXT, price float4);
CREATE TYPE product AS (id INT, name TEXT, items product_item[]);

-- create a table with complex types
CREATE TABLE product_example (
id int,
product product,
products product[],
created_at TIMESTAMP,
updated_at TIMESTAMPTZ
);

-- insert some rows into the table
insert into product_example values (
1,
ROW(1, 'product 1', ARRAY[ROW(1, 'item 1', 1.0), ROW(2, 'item 2', 2.0), NULL]::product_item[])::product,
ARRAY[ROW(1, NULL, NULL)::product, NULL],
now(),
'2022-05-01 12:00:00-04'
);

-- copy the table to a parquet file
COPY product_example TO '/tmp/product_example.parquet' (FORMAT 'parquet', COMPRESSION 'gzip');

-- show table
SELECT * FROM product_example;

-- copy the parquet file to the table
COPY product_example FROM '/tmp/product_example.parquet';

-- show table
SELECT * FROM product_example;
```

### Inspect Parquet schema
You can call `SELECT * FROM parquet.schema(<uri>)` to discover the schema of the Parquet file at the given URI.
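For instance, reusing the `/tmp/product_example.parquet` file written in the usage example above:

```sql
-- list each column of the parquet file along with its parquet type
SELECT * FROM parquet.schema('/tmp/product_example.parquet');
```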

### Inspect Parquet metadata
You can call `SELECT * FROM parquet.metadata(<uri>)` to discover the detailed metadata of the Parquet file, such as column statistics, at the given URI.

You can call `SELECT * FROM parquet.file_metadata(<uri>)` to discover file level metadata of the Parquet file, such as the format version, at the given URI.

You can call `SELECT * FROM parquet.kv_metadata(<uri>)` to query custom key-value metadata of the Parquet file at the given URI.
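As a sketch, again against the file from the usage example, the three metadata functions are called the same way:

```sql
-- row group and column chunk level details, e.g. column statistics
SELECT * FROM parquet.metadata('/tmp/product_example.parquet');

-- file level details, e.g. format version
SELECT * FROM parquet.file_metadata('/tmp/product_example.parquet');

-- custom key-value metadata
SELECT * FROM parquet.kv_metadata('/tmp/product_example.parquet');
```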

## Object Store Support
`pg_parquet` supports reading and writing Parquet files from/to the `S3` object store. Only URIs with the `s3://` scheme are supported.

You can either set the following environment variables or use shared configuration files to access the object store:
- `AWS_ACCESS_KEY_ID`: the access key ID of the AWS account,
- `AWS_SECRET_ACCESS_KEY`: the secret access key of the AWS account,
- `AWS_REGION`: the default region of the AWS account.

You can set the config file path with the `AWS_CONFIG_FILE` environment variable. The default config file path is `~/.aws/config`. You can also set the profile name with the `AWS_PROFILE` environment variable. The default profile name is `default`.
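As a sketch, a minimal pair of shared configuration files in the standard AWS CLI format (key values are placeholders) that could be picked up instead of environment variables:

```ini
# ~/.aws/credentials
[default]
aws_access_key_id = <your access key id>
aws_secret_access_key = <your secret access key>

# ~/.aws/config
[default]
region = us-east-1
```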

## Copy Options
`pg_parquet` supports the following options in the `COPY TO` command:
- `format parquet`: you need to specify this option to read or write Parquet files whose names do not end with a `.parquet[.<compression>]` extension. (This is the only option that the `COPY FROM` command supports.),
- `row_group_size <size>`: the number of rows in each row group while writing Parquet files. The default row group size is `100000`,
- `compression <compression>`: the compression format to use while writing Parquet files. The supported compression formats are `uncompressed`, `snappy`, `gzip`, `brotli`, `lz4`, `lz4raw` and `zstd`. If not specified, the compression format is determined by the file extension, falling back to `uncompressed`.
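As a sketch of combining these options in a single `COPY TO` (the target path is arbitrary, and reuses the table from the usage example):

```sql
COPY product_example TO '/tmp/product_example.parquet'
  (FORMAT 'parquet', COMPRESSION 'zstd', ROW_GROUP_SIZE 10000);
```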

## Configuration
There is currently only one GUC parameter, used to enable or disable `pg_parquet`:
- `pg_parquet.enable_copy_hooks`: you can set this parameter to `on` or `off` to enable or disable the `pg_parquet` extension. The default value is `on`.
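For example, to temporarily fall back to PostgreSQL's built-in `COPY` behavior within a session:

```sql
SET pg_parquet.enable_copy_hooks = off;
-- COPY commands issued here are handled by Postgres itself
SET pg_parquet.enable_copy_hooks = on;
```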

## Supported Types
`pg_parquet` has rich type support, including PostgreSQL's primitive, array, and composite types. Below is the table of the supported types in PostgreSQL and their corresponding Parquet types.

| PostgreSQL Type | Parquet Physical Type | Logical Type |
|-------------------|---------------------------|------------------|
| `bool` | BOOLEAN | |
| `smallint` | INT16 | |
| `integer` | INT32 | |
| `bigint` | INT64 | |
| `real` | FLOAT | |
| `oid` | INT32 | |
| `double` | DOUBLE | |
| `numeric`(1) | FIXED_LEN_BYTE_ARRAY(16) | DECIMAL(128) |
| `text` | BYTE_ARRAY | STRING |
| `json` | BYTE_ARRAY | STRING |
| `bytea` | BYTE_ARRAY | |
| `date` (2) | INT32 | DATE |
| `timestamp` | INT64 | TIMESTAMP_MICROS |
| `timestamptz` (3) | INT64 | TIMESTAMP_MICROS |
| `time` | INT64 | TIME_MICROS |
| `timetz`(3) | INT64 | TIME_MICROS |
| `geometry`(4) | BYTE_ARRAY | |

### Nested Types
| PostgreSQL Type | Parquet Physical Type | Logical Type |
|-------------------|---------------------------|------------------|
| `composite` | GROUP | STRUCT |
| `array` | element's physical type | LIST |
| `crunchy_map`(5) | GROUP | MAP |

> [!WARNING]
> - (1) The `numeric` types with precision <= `38` are represented as `FIXED_LEN_BYTE_ARRAY(16)` with the `DECIMAL(128)` logical type. The `numeric` types with precision > `38` are represented as `BYTE_ARRAY` with the `STRING` logical type.
> - (2) The `date` type is represented according to the `Unix epoch` when writing to Parquet files. It is converted back according to the `PostgreSQL epoch` when reading from Parquet files.
> - (3) The `timestamptz` and `timetz` types are adjusted to `UTC` when writing to Parquet files. They are converted back with the `UTC` timezone when reading from Parquet files.
> - (4) The `geometry` type is represented as `BYTE_ARRAY` encoded as `WKB` when the `postgis` extension is created. Otherwise, it is represented as `BYTE_ARRAY` with the `STRING` logical type.
> - (5) The `crunchy_map` type is represented as `GROUP` with the `MAP` logical type when the `crunchy_map` extension is created. Otherwise, it is represented as `BYTE_ARRAY` with the `STRING` logical type.

> [!WARNING]
> Any type that does not have a corresponding Parquet type will be represented, as a fallback mechanism, as `BYTE_ARRAY` with the `STRING` logical type (e.g. `enum`).
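As a sketch of this fallback, an `enum` column lands in the Parquet file as a string column (the table and type names below are made up for illustration):

```sql
CREATE TYPE mood AS ENUM ('sad', 'ok', 'happy');
CREATE TABLE mood_example (m mood);
INSERT INTO mood_example VALUES ('happy');

-- the enum column is written as BYTE_ARRAY with the STRING logical type
COPY mood_example TO '/tmp/mood_example.parquet' (FORMAT 'parquet');
```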
2 changes: 1 addition & 1 deletion pg_parquet.control
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
comment = 'pg_parquet: Created by pgrx'
comment = 'copy data between Postgres and Parquet'
default_version = '@CARGO_VERSION@'
module_pathname = '$libdir/pg_parquet'
relocatable = false