From 26b6d489a631a73f7a5d33f94da90c51aa907c27 Mon Sep 17 00:00:00 2001 From: aykut-bozkurt <51649454+aykut-bozkurt@users.noreply.github.com> Date: Tue, 17 Dec 2024 14:36:31 +0300 Subject: [PATCH 1/3] Make default numeric scale 9 (#86) To give the integral part more room, we decided to make the default scale = 9 when the user does not specify precision and scale for numeric type. Multiplication of 2 numeric doubles the result scale and gives the integral part less digits. It is wiser to make it smaller but still useful number. --- README.md | 2 +- src/pgrx_tests/copy_type_roundtrip.rs | 4 ++-- src/type_compat/pg_arrow_type_conversions.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 74fdae7..aafc2f5 100644 --- a/README.md +++ b/README.md @@ -239,7 +239,7 @@ There is currently only one GUC parameter to enable/disable the `pg_parquet`: > * `numeric(9 < P <= 18, S)` is represented as `INT64` with `DECIMAL` logical type > * `numeric(18 < P <= 38, S)` is represented as `FIXED_LEN_BYTE_ARRAY(9-16)` with `DECIMAL` logical type > * `numeric(38 < P, S)` is represented as `BYTE_ARRAY` with `STRING` logical type -> * `numeric` is allowed by Postgres. (precision and scale not specified). These are represented by a default precision (38) and scale (16) instead of writing them as string. You get runtime error if your table tries to read or write a numeric value which is not allowed by the default precision and scale (22 integral digits before decimal point, 16 digits after decimal point). +> * `numeric` is allowed by Postgres. (precision and scale not specified). These are represented by a default precision (38) and scale (9) instead of writing them as string. You get runtime error if your table tries to read or write a numeric value which is not allowed by the default precision and scale (29 integral digits before decimal point, 9 digits after decimal point). 
> - (2) The `date` type is represented according to `Unix epoch` when writing to Parquet files. It is converted back according to `PostgreSQL epoch` when reading from Parquet files. > - (3) The `timestamptz` and `timetz` types are adjusted to `UTC` when writing to Parquet files. They are converted back with `UTC` timezone when reading from Parquet files. > - (4) The `geometry` type is represented as `BYTE_ARRAY` encoded as `WKB` when `postgis` extension is created. Otherwise, it is represented as `BYTE_ARRAY` with `STRING` logical type. diff --git a/src/pgrx_tests/copy_type_roundtrip.rs b/src/pgrx_tests/copy_type_roundtrip.rs index 2abc4f5..a5af2b1 100644 --- a/src/pgrx_tests/copy_type_roundtrip.rs +++ b/src/pgrx_tests/copy_type_roundtrip.rs @@ -863,7 +863,7 @@ mod tests { #[pg_test] #[should_panic( - expected = "numeric value contains 23 digits before decimal point, which exceeds max allowed integral digits 22 during copy to parquet" + expected = "numeric value contains 30 digits before decimal point, which exceeds max allowed integral digits 29 during copy to parquet" )] fn test_invalid_unbounded_numeric_integral_digits() { let invalid_integral_digits = @@ -879,7 +879,7 @@ mod tests { #[pg_test] #[should_panic( - expected = "numeric value contains 17 digits after decimal point, which exceeds max allowed decimal digits 16 during copy to parquet" + expected = "numeric value contains 10 digits after decimal point, which exceeds max allowed decimal digits 9 during copy to parquet" )] fn test_invalid_unbounded_numeric_decimal_digits() { let invalid_decimal_digits = DEFAULT_UNBOUNDED_NUMERIC_SCALE + 1; diff --git a/src/type_compat/pg_arrow_type_conversions.rs b/src/type_compat/pg_arrow_type_conversions.rs index dec5ee1..3022fd5 100644 --- a/src/type_compat/pg_arrow_type_conversions.rs +++ b/src/type_compat/pg_arrow_type_conversions.rs @@ -277,7 +277,7 @@ fn rescale_unbounded_numeric_or_error( const MAX_NUMERIC_PRECISION: u32 = 38; pub(crate) const 
DEFAULT_UNBOUNDED_NUMERIC_PRECISION: u32 = MAX_NUMERIC_PRECISION; -pub(crate) const DEFAULT_UNBOUNDED_NUMERIC_SCALE: u32 = 16; +pub(crate) const DEFAULT_UNBOUNDED_NUMERIC_SCALE: u32 = 9; pub(crate) const DEFAULT_UNBOUNDED_NUMERIC_MAX_INTEGRAL_DIGITS: u32 = DEFAULT_UNBOUNDED_NUMERIC_PRECISION - DEFAULT_UNBOUNDED_NUMERIC_SCALE; From fd51bed0bd2d1c2002cacf9870f4e3e825304772 Mon Sep 17 00:00:00 2001 From: aykut-bozkurt <51649454+aykut-bozkurt@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:09:57 +0300 Subject: [PATCH 2/3] Prepare v0.2.0 release (#87) --- CONTRIBUTING.md | 16 ++ Cargo.lock | 262 +++++++++++++++---------------- Cargo.toml | 2 +- sql/pg_parquet--0.1.0--0.2.0.sql | 1 + sql/pg_parquet.sql | 93 +++++++++++ 5 files changed, 241 insertions(+), 133 deletions(-) create mode 100644 sql/pg_parquet--0.1.0--0.2.0.sql create mode 100644 sql/pg_parquet.sql diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 10869e6..70d92e9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -107,3 +107,19 @@ are started. You can also see the required environment variables from We use `cargo-fmt` as formatter and `cargo-clippy` as linter. You can check how we run them from [ci.yml](.github/workflows/ci.yml). + + +# Release + +We apply semantic versioning for our releases. We do not support long term release branches (backporting) yet. +The release process is as follows: + +1. Open PR to start release preparation, +2. Bump the package version at `Cargo.toml` file +3. Upgrade dependencies via `cargo update` +4. Use a schema diff tool, if possible (or manually), to generate + - sql upgrade file from previous release to the current release `pg_parquet--<previous-version>--<current-version>.sql` + - sql file of the current schema `pg_parquet.sql` +5. Merge the PR into main +6. Tag the latest commit with naming convention of `v<major>.<minor>.<patch>` +7. 
Release it with important and breaking (if any) changes diff --git a/Cargo.lock b/Cargo.lock index 7bcb35d..066b734 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -88,9 +88,9 @@ checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anyhow" -version = "1.0.93" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" +checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" [[package]] name = "arrow" @@ -406,7 +406,7 @@ dependencies = [ "hex", "hmac", "http 0.2.12", - "http 1.1.0", + "http 1.2.0", "once_cell", "percent-encoding", "sha2", @@ -416,9 +416,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" +checksum = "8aa8ff1492fd9fb99ae28e8467af0dbbb7c31512b16fabf1a0f10d7bb6ef78bb" dependencies = [ "futures-util", "pin-project-lite", @@ -475,9 +475,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.4" +version = "1.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" +checksum = "431a10d0e07e09091284ef04453dae4069283aa108d209974d67e77ae1caa658" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -490,7 +490,7 @@ dependencies = [ "http-body 0.4.6", "http-body 1.0.1", "httparse", - "hyper 0.14.31", + "hyper 0.14.32", "hyper-rustls 0.24.2", "once_cell", "pin-project-lite", @@ -510,7 +510,7 @@ dependencies = [ "aws-smithy-types", "bytes", "http 0.2.12", - "http 1.1.0", + "http 1.2.0", "pin-project-lite", "tokio", "tracing", @@ -519,16 +519,16 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.9" +version = "1.2.10" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" +checksum = "8ecbf4d5dfb169812e2b240a4350f15ad3c6b03a54074e5712818801615f2dc5" dependencies = [ "base64-simd", "bytes", "bytes-utils", "futures-core", "http 0.2.12", - "http 1.1.0", + "http 1.2.0", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -624,18 +624,18 @@ dependencies = [ [[package]] name = "bit-set" -version = "0.5.3" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" dependencies = [ "bit-vec", ] [[package]] name = "bit-vec" -version = "0.6.3" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" [[package]] name = "bitflags" @@ -745,7 +745,7 @@ checksum = "2d886547e41f740c616ae73108f6eb70afe6d940c7bc697cb30f13daec073037" dependencies = [ "camino", "cargo-platform", - "semver 1.0.23", + "semver 1.0.24", "serde", "serde_json", "thiserror 1.0.69", @@ -763,9 +763,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.2" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc" +checksum = "9157bbaa6b165880c27a4293a474c91cdcf265cc68cc829bf10be0964a391caf" dependencies = [ "jobserver", "libc", @@ -805,9 +805,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.38" +version = "0.4.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +checksum = 
"7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" dependencies = [ "android-tzdata", "iana-time-zone", @@ -829,9 +829,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.21" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" +checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" dependencies = [ "clap_builder", "clap_derive", @@ -850,9 +850,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.21" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" dependencies = [ "anstyle", "clap_lex", @@ -872,9 +872,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "const-random" @@ -951,9 +951,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ "crossbeam-epoch", "crossbeam-utils", @@ -970,9 +970,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.20" +version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" @@ -1081,9 
+1081,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "fixedbitset" @@ -1286,7 +1286,7 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http 1.1.0", + "http 1.2.0", "indexmap", "slab", "tokio", @@ -1365,11 +1365,11 @@ dependencies = [ [[package]] name = "home" -version = "0.5.9" +version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1385,9 +1385,9 @@ dependencies = [ [[package]] name = "http" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" dependencies = [ "bytes", "fnv", @@ -1412,7 +1412,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.1.0", + "http 1.2.0", ] [[package]] @@ -1423,7 +1423,7 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "pin-project-lite", ] @@ -1448,9 +1448,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.31" +version = "0.14.32" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c08302e8fa335b151b788c775ff56e7a03ae64ff85c548ee820fecb70356e85" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" dependencies = [ "bytes", "futures-channel", @@ -1472,15 +1472,15 @@ dependencies = [ [[package]] name = "hyper" -version = "1.5.1" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97818827ef4f364230e16705d4706e2897df2bb60617d6ca15d598025a3c481f" +checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" dependencies = [ "bytes", "futures-channel", "futures-util", "h2 0.4.7", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "httparse", "itoa", @@ -1498,7 +1498,7 @@ checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" dependencies = [ "futures-util", "http 0.2.12", - "hyper 0.14.31", + "hyper 0.14.32", "log", "rustls 0.21.12", "rustls-native-certs 0.6.3", @@ -1513,14 +1513,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" dependencies = [ "futures-util", - "http 1.1.0", - "hyper 1.5.1", + "http 1.2.0", + "hyper 1.5.2", "hyper-util", - "rustls 0.23.19", + "rustls 0.23.20", "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.1", "tower-service", ] @@ -1533,9 +1533,9 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", - "hyper 1.5.1", + "hyper 1.5.2", "pin-project-lite", "socket2", "tokio", @@ -1776,9 +1776,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.74" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a865e038f7f6ed956f788f0d7d60c541fff74c7bd74272c5d4cf15c63743e705" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" dependencies = [ "once_cell", 
"wasm-bindgen", @@ -1792,9 +1792,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -1805,9 +1805,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -1816,9 +1816,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" dependencies = [ "lexical-util", "static_assertions", @@ -1826,18 +1826,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "1.0.3" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" +checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" dependencies = [ "lexical-util", 
"lexical-write-integer", @@ -1846,9 +1846,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" dependencies = [ "lexical-util", "static_assertions", @@ -1856,9 +1856,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.167" +version = "0.2.168" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +checksum = "5aaeb2981e0606ca11d79718f8bb01164f1d6ed75080182d3abf017e6d244b6d" [[package]] name = "libloading" @@ -2081,7 +2081,7 @@ dependencies = [ "chrono", "futures", "humantime", - "hyper 1.5.1", + "hyper 1.5.2", "itertools", "md-5", "parking_lot", @@ -2219,12 +2219,12 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pest" -version = "2.7.14" +version = "2.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "879952a81a83930934cbf1786752d6dedc3b1f29e8f8fb2ad1d0a36f377cf442" +checksum = "8b7cafe60d6cf8e62e1b9b2ea516a089c008945bb5a275416789e7db0bc199dc" dependencies = [ "memchr", - "thiserror 1.0.69", + "thiserror 2.0.7", "ucd-trie", ] @@ -2240,7 +2240,7 @@ dependencies = [ [[package]] name = "pg_parquet" -version = "0.1.0" +version = "0.2.0" dependencies = [ "arrow", "arrow-cast", @@ -2490,9 +2490,9 @@ dependencies = [ [[package]] name = "proptest" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c2511913b88df1637da85cc8d96ec8e43a3f8bb8ccb71ee1ac240d6f3df58d" +checksum = "14cae93065090804185d3b75f0bf93b8eeda30c7a9b4a33d3bdb3988d6229e50" dependencies = [ "bit-set", "bit-vec", @@ -2535,9 +2535,9 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash 
2.1.0", - "rustls 0.23.19", + "rustls 0.23.20", "socket2", - "thiserror 2.0.3", + "thiserror 2.0.7", "tokio", "tracing", ] @@ -2553,10 +2553,10 @@ dependencies = [ "rand", "ring", "rustc-hash 2.1.0", - "rustls 0.23.19", + "rustls 0.23.20", "rustls-pki-types", "slab", - "thiserror 2.0.3", + "thiserror 2.0.7", "tinyvec", "tracing", "web-time", @@ -2564,9 +2564,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d5a626c6807713b15cac82a6acaccd6043c9a5408c24baae07611fec3f243da" +checksum = "52cd4b1eff68bf27940dd39811292c49e007f4d0b4c357358dc9b0197be6b527" dependencies = [ "cfg_aliases", "libc", @@ -2652,9 +2652,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ "bitflags 2.6.0", ] @@ -2705,10 +2705,10 @@ dependencies = [ "futures-core", "futures-util", "h2 0.4.7", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.5.1", + "hyper 1.5.2", "hyper-rustls 0.27.3", "hyper-util", "ipnet", @@ -2719,7 +2719,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.19", + "rustls 0.23.20", "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustls-pki-types", @@ -2728,7 +2728,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.1", "tokio-util", "tower-service", "url", @@ -2787,20 +2787,20 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ - "semver 1.0.23", + "semver 1.0.24", ] [[package]] name = "rustix" -version = "0.38.41" +version = 
"0.38.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" +checksum = "f93dc38ecbab2eb790ff964bb77fa94faf256fd3e73285fd7ba0903b76bedb85" dependencies = [ "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2817,9 +2817,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.19" +version = "0.23.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "934b404430bb06b3fae2cba809eb45a1ab1aecd64491213d7c3301b88393f8d1" +checksum = "5065c3f250cbd332cd894be57c40fa52387247659b14a2d6041d121547903b1b" dependencies = [ "once_cell", "ring", @@ -2873,9 +2873,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" dependencies = [ "web-time", ] @@ -3006,9 +3006,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.23" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" dependencies = [ "serde", ] @@ -3030,9 +3030,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.215" +version = "1.0.216" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" dependencies = [ "serde_derive", ] @@ -3049,9 +3049,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.215" +version = 
"1.0.216" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" dependencies = [ "proc-macro2", "quote", @@ -3302,11 +3302,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.3" +version = "2.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" +checksum = "93605438cbd668185516ab499d589afb7ee1859ea3d5fc8f6b0755e1c7443767" dependencies = [ - "thiserror-impl 2.0.3", + "thiserror-impl 2.0.7", ] [[package]] @@ -3322,9 +3322,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.3" +version = "2.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" +checksum = "e1d8749b4531af2117677a5fcd12b1348a3fe2b81e36e61ffeac5c4aa3273e36" dependencies = [ "proc-macro2", "quote", @@ -3408,9 +3408,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.41.1" +version = "1.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" +checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" dependencies = [ "backtrace", "bytes", @@ -3471,20 +3471,19 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.19", - "rustls-pki-types", + "rustls 0.23.20", "tokio", ] [[package]] name = "tokio-util" -version = "0.7.12" +version = "0.7.13" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" dependencies = [ "bytes", "futures-core", @@ -3606,9 +3605,9 @@ checksum = "ccb97dac3243214f8d8507998906ca3e2e0b900bf9bf4870477f125b82e68f6e" [[package]] name = "unicode-bidi" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" +checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" @@ -3741,9 +3740,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d15e63b4482863c109d70a7b8706c1e364eb6ea449b201a76c5b89cedcec2d5c" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" dependencies = [ "cfg-if", "once_cell", @@ -3752,13 +3751,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d36ef12e3aaca16ddd3f67922bc63e48e953f126de60bd33ccc0101ef9998cd" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", "syn", @@ -3767,9 +3765,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.47" +version = "0.4.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dfaf8f50e5f293737ee323940c7d8b08a66a95a419223d9f41610ca08b0833d" +checksum = "38176d9b44ea84e9184eff0bc34cc167ed044f816accfe5922e54d84cf48eca2" dependencies = [ "cfg-if", "js-sys", @@ -3780,9 +3778,9 @@ dependencies = [ [[package]] name = 
"wasm-bindgen-macro" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "705440e08b42d3e4b36de7d66c944be628d579796b8090bfa3471478a2260051" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3790,9 +3788,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", @@ -3803,9 +3801,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee99da9c5ba11bd675621338ef6fa52296b76b83305e9b6e5c77d4c286d6d49" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" [[package]] name = "wasm-streams" @@ -3822,9 +3820,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.74" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a98bc3c33f0fe7e59ad7cd041b89034fa82a7c2d4365ca538dda6cdaf513863c" +checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index dc8acc8..e77e462 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pg_parquet" -version = "0.1.0" +version = "0.2.0" edition = "2021" license-file = "LICENSE" diff --git a/sql/pg_parquet--0.1.0--0.2.0.sql b/sql/pg_parquet--0.1.0--0.2.0.sql new file mode 100644 index 0000000..1180ff2 --- /dev/null +++ b/sql/pg_parquet--0.1.0--0.2.0.sql @@ -0,0 +1 @@ +-- no changes diff --git a/sql/pg_parquet.sql b/sql/pg_parquet.sql new file mode 100644 index 
0000000..dd5d010 --- /dev/null +++ b/sql/pg_parquet.sql @@ -0,0 +1,93 @@ +-- create roles for parquet object store read and write if they do not exist +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'parquet_object_store_read') THEN + CREATE ROLE parquet_object_store_read; + END IF; + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'parquet_object_store_write') THEN + CREATE ROLE parquet_object_store_write; + END IF; +END $$; + +-- error if the schema already exists +CREATE SCHEMA parquet; +REVOKE ALL ON SCHEMA parquet FROM public; +GRANT USAGE ON SCHEMA parquet TO public; +GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA parquet TO public; + +-- parquet key value metadata function +CREATE FUNCTION parquet."kv_metadata"( + "uri" TEXT +) RETURNS TABLE ( + "uri" TEXT, + "key" BYTEA, + "value" BYTEA +) +STRICT +LANGUAGE c +AS 'MODULE_PATHNAME', 'kv_metadata_wrapper'; + +-- parquet metadata function +CREATE FUNCTION parquet."metadata"( + "uri" TEXT +) RETURNS TABLE ( + "uri" TEXT, + "row_group_id" BIGINT, + "row_group_num_rows" BIGINT, + "row_group_num_columns" BIGINT, + "row_group_bytes" BIGINT, + "column_id" BIGINT, + "file_offset" BIGINT, + "num_values" BIGINT, + "path_in_schema" TEXT, + "type_name" TEXT, + "stats_null_count" BIGINT, + "stats_distinct_count" BIGINT, + "stats_min" TEXT, + "stats_max" TEXT, + "compression" TEXT, + "encodings" TEXT, + "index_page_offset" BIGINT, + "dictionary_page_offset" BIGINT, + "data_page_offset" BIGINT, + "total_compressed_size" BIGINT, + "total_uncompressed_size" BIGINT +) +STRICT +LANGUAGE c +AS 'MODULE_PATHNAME', 'metadata_wrapper'; + +-- parquet file metadata function +CREATE FUNCTION parquet."file_metadata"( + "uri" TEXT +) RETURNS TABLE ( + "uri" TEXT, + "created_by" TEXT, + "num_rows" BIGINT, + "num_row_groups" BIGINT, + "format_version" TEXT +) +STRICT +LANGUAGE c +AS 'MODULE_PATHNAME', 'file_metadata_wrapper'; + +CREATE SCHEMA IF NOT EXISTS parquet; + +CREATE FUNCTION parquet."schema"( + "uri" TEXT +) 
RETURNS TABLE ( + "uri" TEXT, + "name" TEXT, + "type_name" TEXT, + "type_length" TEXT, + "repetition_type" TEXT, + "num_children" INT, + "converted_type" TEXT, + "scale" INT, + "precision" INT, + "field_id" INT, + "logical_type" TEXT +) +STRICT +LANGUAGE c +AS 'MODULE_PATHNAME', 'schema_wrapper'; From 2c1a62d0bacdd51afeafe52b20cd92c7c69d0096 Mon Sep 17 00:00:00 2001 From: aykut-bozkurt <51649454+aykut-bozkurt@users.noreply.github.com> Date: Tue, 24 Dec 2024 17:44:48 +0300 Subject: [PATCH 3/3] Fix memory leaks for huge tables (#89) We process each row group sequentially during "COPY FROM parquet". Normally, we expect that memory consumption does not exceed too much the row group size. But we also do some allocations during the copy at current Postgres context, which can be extreme for some huge tables (e.g. with 100 columns and default row group size ~ 123000) To fix the issue, we introduce a memory context that is used and freed per each row during the copy. --- src/arrow_parquet/parquet_reader.rs | 58 +++++++++++++++++------------ 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/src/arrow_parquet/parquet_reader.rs b/src/arrow_parquet/parquet_reader.rs index 9a2cf1d..6790513 100644 --- a/src/arrow_parquet/parquet_reader.rs +++ b/src/arrow_parquet/parquet_reader.rs @@ -10,7 +10,8 @@ use pgrx::{ fmgr_info, getTypeBinaryOutputInfo, varlena, Datum, FmgrInfo, FormData_pg_attribute, InvalidOid, SendFunctionCall, }, - vardata_any, varsize_any_exhdr, void_mut_ptr, AllocatedByPostgres, PgBox, PgTupleDesc, + vardata_any, varsize_any_exhdr, void_mut_ptr, AllocatedByPostgres, PgBox, PgMemoryContexts, + PgTupleDesc, }; use url::Url; @@ -44,6 +45,7 @@ pub(crate) struct ParquetReaderContext { attribute_contexts: Vec, binary_out_funcs: Vec>, match_by: MatchBy, + per_row_memory_ctx: PgMemoryContexts, } impl ParquetReaderContext { @@ -88,6 +90,8 @@ impl ParquetReaderContext { let binary_out_funcs = Self::collect_binary_out_funcs(&attributes); + let per_row_memory_ctx = 
PgMemoryContexts::new("COPY FROM parquet per row memory context"); + ParquetReaderContext { buffer: Vec::new(), offset: 0, @@ -97,6 +101,7 @@ impl ParquetReaderContext { match_by, started: false, finished: false, + per_row_memory_ctx, } } @@ -172,12 +177,10 @@ impl ParquetReaderContext { } if !self.started { - // starts PG copy protocol + // starts PG copy self.copy_start(); } - let natts = self.attribute_contexts.len() as i16; - // read a record batch from the parquet file. Record batch will contain // DEFAULT_BATCH_SIZE rows as we configured in the parquet reader. let record_batch = PG_BACKEND_TOKIO_RUNTIME.block_on(self.parquet_reader.next()); @@ -193,8 +196,21 @@ impl ParquetReaderContext { // slice the record batch to get the next row let record_batch = record_batch.slice(i, 1); + self.copy_row(record_batch); + } + } else { + // finish PG copy + self.copy_finish(); + } + + true + } + fn copy_row(&mut self, record_batch: RecordBatch) { + unsafe { + self.per_row_memory_ctx.switch_to(|_context| { /* 2 bytes: per-tuple header */ + let natts = self.attribute_contexts.len() as i16; let attnum_len_bytes = natts.to_be_bytes(); self.buffer.extend_from_slice(&attnum_len_bytes); @@ -209,20 +225,17 @@ impl ParquetReaderContext { for (datum, out_func) in tuple_datums.into_iter().zip(self.binary_out_funcs.iter()) { if let Some(datum) = datum { - unsafe { - let datum_bytes: *mut varlena = - SendFunctionCall(out_func.as_ptr(), datum); - - /* 4 bytes: attribute's data size */ - let data_size = varsize_any_exhdr(datum_bytes); - let data_size_bytes = (data_size as i32).to_be_bytes(); - self.buffer.extend_from_slice(&data_size_bytes); - - /* variable bytes: attribute's data */ - let data = vardata_any(datum_bytes) as _; - let data_bytes = std::slice::from_raw_parts(data, data_size); - self.buffer.extend_from_slice(data_bytes); - }; + let datum_bytes: *mut varlena = SendFunctionCall(out_func.as_ptr(), datum); + + /* 4 bytes: attribute's data size */ + let data_size = 
varsize_any_exhdr(datum_bytes); + let data_size_bytes = (data_size as i32).to_be_bytes(); + self.buffer.extend_from_slice(&data_size_bytes); + + /* variable bytes: attribute's data */ + let data = vardata_any(datum_bytes) as _; + let data_bytes = std::slice::from_raw_parts(data, data_size); + self.buffer.extend_from_slice(data_bytes); } else { /* 4 bytes: null */ let null_value = -1_i32; @@ -230,13 +243,10 @@ impl ParquetReaderContext { self.buffer.extend_from_slice(&null_value_bytes); } } - } - } else { - // finish PG copy protocol - self.copy_finish(); - } + }); - true + self.per_row_memory_ctx.reset(); + }; } fn copy_start(&mut self) {