diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 9bd42dbaa0d6..9c4cda5d034d 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -7,3 +7,9 @@ updates: open-pull-requests-limit: 10 target-branch: master labels: [auto-dependencies] + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" + open-pull-requests-limit: 10 + labels: [auto-dependencies] diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index f8456c5368f8..16cd9a9a0a5a 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -41,7 +41,7 @@ jobs: # "1" means line tables only, which is useful for panic tracebacks. RUSTFLAGS: "-C debuginfo=1" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain @@ -51,9 +51,9 @@ jobs: - name: Test run: | cargo test -p arrow - - name: Test --features=force_validate,prettyprint + - name: Test --features=force_validate,prettyprint,ffi run: | - cargo test -p arrow --features=force_validate,prettyprint + cargo test -p arrow --features=force_validate,prettyprint,ffi - name: Run examples run: | # Test arrow examples @@ -73,7 +73,7 @@ jobs: # "1" means line tables only, which is useful for panic tracebacks. RUSTFLAGS: "-C debuginfo=1" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain @@ -107,7 +107,7 @@ jobs: # "1" means line tables only, which is useful for panic tracebacks. RUSTFLAGS: "-C debuginfo=1" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain @@ -136,7 +136,7 @@ jobs: # "1" means line tables only, which is useful for panic tracebacks. RUSTFLAGS: "-C debuginfo=1" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Cache Cargo @@ -153,8 +153,8 @@ jobs: - name: Build run: | cd arrow - cargo build --no-default-features --features=csv,ipc,simd --target wasm32-unknown-unknown - cargo build --no-default-features --features=csv,ipc,simd --target wasm32-wasi + cargo build --no-default-features --features=csv,ipc,simd,ffi --target wasm32-unknown-unknown + cargo build --no-default-features --features=csv,ipc,simd,ffi --target wasm32-wasi clippy: name: Clippy @@ -162,7 +162,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -172,4 +172,4 @@ jobs: rustup component add clippy - name: Run clippy run: | - cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils --all-targets -- -D warnings + cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi --all-targets -- -D warnings diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 5e5538121164..86a67ff9a6a4 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -43,7 +43,7 @@ jobs: # "1" means line tables only, which is useful for panic tracebacks. 
RUSTFLAGS: "-C debuginfo=1" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain @@ -63,7 +63,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: diff --git a/.github/workflows/cancel.yml b/.github/workflows/cancel.yml index 1cf7a1356037..a98c8ee5d225 100644 --- a/.github/workflows/cancel.yml +++ b/.github/workflows/cancel.yml @@ -16,7 +16,7 @@ # under the License. # Attempt to cancel stale workflow runs to save github actions runner time -name: Cancel stale runs +name: cancel on: workflow_run: diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml deleted file mode 100644 index 6ca095328af1..000000000000 --- a/.github/workflows/comment_bot.yml +++ /dev/null @@ -1,72 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Comment Bot - -on: - # TODO(kszucs): support pull_request_review_comment - issue_comment: - types: - - created - - edited - -jobs: - crossbow: - name: Listen! - if: startsWith(github.event.comment.body, '@github-actions crossbow') - runs-on: ubuntu-latest - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - repository: apache/arrow - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - name: Install Archery and Crossbow dependencies - run: pip install -e dev/archery[bot] - - name: Handle Github comment event - env: - ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - CROSSBOW_GITHUB_TOKEN: ${{ secrets.CROSSBOW_GITHUB_TOKEN }} - run: | - archery trigger-bot \ - --event-name ${{ github.event_name }} \ - --event-payload ${{ github.event_path }} - - rebase: - name: "Rebase" - if: startsWith(github.event.comment.body, '@github-actions rebase') - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: r-lib/actions/pr-fetch@master - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - name: Rebase on ${{ github.repository }} master - run: | - set -ex - git config user.name "$(git log -1 --pretty=format:%an)" - git config user.email "$(git log -1 --pretty=format:%ae)" - git remote add upstream https://github.com/${{ github.repository }} - git fetch --unshallow upstream master - git rebase upstream/master - - uses: r-lib/actions/pr-push@master - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - args: "--force" diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml new file mode 100644 index 000000000000..e688428e187c --- /dev/null +++ b/.github/workflows/coverage.yml @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: coverage + +# Trigger only on pushes to master, not pull requests +on: + push: + branches: + - master + +jobs: + + coverage: + name: Coverage + runs-on: ubuntu-latest + # Note runs outside of a container + # otherwise we get this error: + # Failed to run tests: ASLR disable failed: EPERM: Operation not permitted + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: Setup Rust toolchain + run: | + rustup toolchain install stable + rustup default stable + - name: Install protobuf compiler in /protoc + run: | + sudo mkdir /protoc + sudo chmod a+rwx /protoc + cd /protoc + curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/protoc-21.4-linux-x86_64.zip + unzip protoc-21.4-linux-x86_64.zip + - name: Cache Cargo + uses: actions/cache@v3 + with: + path: /home/runner/.cargo + key: cargo-coverage-cache3- + - name: Run coverage + run: | + export PATH=$PATH:/protoc/bin + rustup toolchain install stable + rustup default stable + cargo install --version 0.18.2 cargo-tarpaulin + cargo tarpaulin --all --out Xml + - name: Report coverage + continue-on-error: true + run: bash <(curl -s https://codecov.io/bash) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 21263a9211e1..57dc19482761 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -name: Dev +name: dev # trigger for all PRs and changes to master on: @@ -34,24 +34,24 @@ jobs: name: Release Audit Tool (RAT) runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v4 with: python-version: 3.8 - name: Audit licenses run: ./dev/release/run-rat.sh . 
prettier: - name: Use prettier to check formatting of markdown documents + name: Markdown format runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-node@v2 + - uses: actions/checkout@v3 + - uses: actions/setup-node@v3 with: node-version: "14" - name: Prettier check run: | - # if you encounter error, try rerun the command below with --write instead of --check - # and commit the changes - npx prettier@2.3.0 --check {arrow,arrow-flight,dev,integration-testing,parquet}/**/*.md README.md CODE_OF_CONDUCT.md CONTRIBUTING.md + # if you encounter an error, run the command below and commit the changes + npx prettier@2.3.2 --write {arrow,arrow-flight,dev,integration-testing,parquet}/**/*.md README.md CODE_OF_CONDUCT.md CONTRIBUTING.md + git diff --exit-code diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 093d376713d8..64f7ecc0039f 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -name: Dev PR +name: dev_pr # Trigger whenever a PR is changed (title as well as new / changed commits) on: @@ -30,14 +30,14 @@ jobs: name: Process runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Assign GitHub labels if: | github.event_name == 'pull_request_target' && (github.event.action == 'opened' || github.event.action == 'synchronize') - uses: actions/labeler@2.2.0 + uses: actions/labeler@v4.0.0 with: repo-token: ${{ secrets.GITHUB_TOKEN }} configuration-path: .github/workflows/dev_pr/labeler.yml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b3f6d9b61664..5e82d76febe6 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -name: Docs +name: docs # trigger for all PRs and changes to master on: @@ -39,7 +39,7 @@ jobs: env: RUSTDOCFLAGS: "-Dwarnings" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Install python dev diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 71ec99d8dce2..81969466ebf4 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -name: Integration +name: integration # trigger for all PRs that touch certain files and changes to master on: @@ -36,18 +36,18 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout Arrow - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: repository: apache/arrow submodules: true fetch-depth: 0 - name: Checkout Arrow Rust - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: path: rust fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: 3.8 - name: Setup Archery @@ -63,7 +63,7 @@ jobs: matrix: rust: [stable] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain @@ -82,7 +82,7 @@ jobs: path: /home/runner/target # this key is not equal because maturin uses different compilation flags. 
key: ${{ runner.os }}-${{ matrix.arch }}-target-maturin-cache-${{ matrix.rust }}- - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v4 with: python-version: '3.7' - name: Upgrade pip and setuptools diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index 732f92a1c36a..b4669bbcccc0 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -name: MIRI +name: miri # trigger for all PRs that touch certain files and changes to master on: @@ -32,7 +32,7 @@ jobs: name: MIRI runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index bf07a2efaad6..6c81604a96a2 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -36,7 +36,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup Rust toolchain with clippy run: | rustup toolchain install stable @@ -44,7 +44,7 @@ jobs: rustup component add clippy - name: Run clippy run: | - cargo clippy -p object_store --all-features + cargo clippy -p object_store --all-features -- -D warnings # test the crate linux-test: @@ -85,7 +85,7 @@ jobs: OBJECT_STORE_BUCKET: test-bucket steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Configure Fake GCS Server (GCP emulation) run: | diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index d8e09f04ba83..e3f66751044f 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -43,7 +43,7 @@ jobs: # "1" means line tables only, which is useful for panic tracebacks. RUSTFLAGS: "-C debuginfo=1" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain @@ -69,7 +69,7 @@ jobs: # "1" means line tables only, which is useful for panic tracebacks. RUSTFLAGS: "-C debuginfo=1" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain @@ -118,7 +118,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -128,8 +128,4 @@ jobs: rustup component add clippy - name: Run clippy run: | - # Only run clippy for the library at this time, - # as there are clippy errors for other targets - cargo clippy -p parquet --all-features --lib -- -D warnings - # https://github.com/apache/arrow-rs/issues/1254 - #cargo clippy -p parquet --all-targets --all-features -- -D warnings + cargo clippy -p parquet --all-targets --all-features -- -D warnings diff --git a/.github/workflows/parquet_derive.yml b/.github/workflows/parquet_derive.yml index f7176498c55d..bd70fc30d1c5 100644 --- a/.github/workflows/parquet_derive.yml +++ b/.github/workflows/parquet_derive.yml @@ -44,7 +44,7 @@ jobs: # "1" means line tables only, which is useful for panic tracebacks. 
RUSTFLAGS: "-C debuginfo=1" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain @@ -61,7 +61,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4a54c22b3545..c04d5643b49a 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -# tests for workspace wide -name: Rust +# workspace wide tests +name: rust # trigger for all PRs and changes to master on: @@ -33,7 +33,7 @@ jobs: name: Test on Mac runs-on: macos-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Install protoc with brew @@ -57,7 +57,7 @@ jobs: name: Test on Windows runs-on: windows-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: true - name: Install protobuf compiler in /d/protoc @@ -90,7 +90,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup toolchain run: | rustup toolchain install stable @@ -98,41 +98,3 @@ jobs: rustup component add rustfmt - name: Run run: cargo fmt --all -- --check - - - coverage: - name: Coverage - runs-on: ubuntu-latest - # Note runs outside of a container - # otherwise we get this error: - # Failed to run tests: ASLR disable failed: EPERM: Operation not permitted - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Setup Rust toolchain - run: | - rustup toolchain install stable - rustup default stable - - name: Install protobuf compiler in /protoc - run: | - sudo mkdir /protoc - sudo chmod a+rwx /protoc - cd /protoc - curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/protoc-21.4-linux-x86_64.zip - unzip protoc-21.4-linux-x86_64.zip - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /home/runner/.cargo - key: cargo-coverage-cache3- - - name: Run coverage - run: | - export PATH=$PATH:/protoc/bin - rustup toolchain install stable - rustup default stable - cargo install --version 0.18.2 cargo-tarpaulin - cargo tarpaulin --all --out Xml - - name: Report coverage - continue-on-error: true - run: bash <(curl -s https://codecov.io/bash) diff --git a/.github_changelog_generator b/.github_changelog_generator index cc23a6332d60..9a9a84344866 100644 --- a/.github_changelog_generator +++ b/.github_changelog_generator @@ -24,5 +24,5 @@ add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":[" #pull-requests=false # so that the component is shown associated with the issue issue-line-labels=arrow,parquet,arrow-flight -exclude-labels=development-process,invalid +exclude-labels=development-process,invalid,object-store breaking_labels=api-change diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index c0049af39b93..25be8961d2d8 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -20,6 +20,96 @@ # Historical Changelog +## [19.0.0](https://github.com/apache/arrow-rs/tree/19.0.0) (2022-07-22) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/18.0.0...19.0.0) + +**Breaking changes:** + +- Rename `DecimalArray``/DecimalBuilder` to `Decimal128Array`/`Decimal128Builder` [\#2101](https://github.com/apache/arrow-rs/issues/2101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- 
Change builder `append` methods to be infallible where possible [\#2103](https://github.com/apache/arrow-rs/pull/2103) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Return reference from `UnionArray::child` \(\#2035\) [\#2099](https://github.com/apache/arrow-rs/pull/2099) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove `preserve_order` feature from `serde_json` dependency \(\#2095\) [\#2098](https://github.com/apache/arrow-rs/pull/2098) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Rename `weekday` and `weekday0` kernels to to `num_days_from_monday` and `num_days_since_sunday` [\#2066](https://github.com/apache/arrow-rs/pull/2066) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Remove `null_count` from `write_batch_with_statistics` [\#2047](https://github.com/apache/arrow-rs/pull/2047) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Use `total_cmp` from std [\#2130](https://github.com/apache/arrow-rs/issues/2130) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Permit parallel fetching of column chunks in `ParquetRecordBatchStream` [\#2110](https://github.com/apache/arrow-rs/issues/2110) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- The `GenericBinaryBuilder` should use buffer builders directly. [\#2104](https://github.com/apache/arrow-rs/issues/2104) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Pass `generate_decimal256_case` arrow integration test [\#2093](https://github.com/apache/arrow-rs/issues/2093) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Rename `weekday` and `weekday0` kernels to to `num_days_from_monday` and `days_since_sunday` [\#2065](https://github.com/apache/arrow-rs/issues/2065) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve performance of `filter_dict` [\#2062](https://github.com/apache/arrow-rs/issues/2062) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve performance of `set_bits` [\#2060](https://github.com/apache/arrow-rs/issues/2060) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Lazily materialize the null buffer builder of `BooleanBuilder` [\#2058](https://github.com/apache/arrow-rs/issues/2058) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `BooleanArray::from_iter` should omit validity buffer if all values are valid [\#2055](https://github.com/apache/arrow-rs/issues/2055) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- FFI\_ArrowSchema should set `DICTIONARY_ORDERED` flag if a field's dictionary is ordered [\#2049](https://github.com/apache/arrow-rs/issues/2049) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `peek_next_page()` and `skip_next_page` in `SerializedPageReader` [\#2043](https://github.com/apache/arrow-rs/issues/2043) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support FFI / C Data Interface for `MapType` [\#2037](https://github.com/apache/arrow-rs/issues/2037) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- The `DecimalArrayBuilder` should use `FixedSizedBinaryBuilder` 
[\#2026](https://github.com/apache/arrow-rs/issues/2026) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Enable `serialized_reader` read specific Page by passing row ranges. [\#1976](https://github.com/apache/arrow-rs/issues/1976) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Fixed bugs:** + +- `type_id` and `value_offset` are incorrect for sliced `UnionArray` [\#2086](https://github.com/apache/arrow-rs/issues/2086) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Boolean `take` kernel does not handle null indices correctly [\#2057](https://github.com/apache/arrow-rs/issues/2057) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Don't double-count nulls in `write_batch_with_statistics` [\#2046](https://github.com/apache/arrow-rs/issues/2046) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet Writer Ignores Statistics specification in `WriterProperties` [\#2014](https://github.com/apache/arrow-rs/issues/2014) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Documentation updates:** + +- Improve docstrings + examples for `as_primitive_array` cast functions [\#2114](https://github.com/apache/arrow-rs/pull/2114) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Closed issues:** + +- Why does `serde_json` specify the `preserve_order` feature in `arrow` package [\#2095](https://github.com/apache/arrow-rs/issues/2095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `skip_values` in DictionaryDecoder [\#2079](https://github.com/apache/arrow-rs/issues/2079) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support skip\_values in ColumnValueDecoderImpl [\#2078](https://github.com/apache/arrow-rs/issues/2078) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support `skip_values` in `ByteArrayColumnValueDecoder` [\#2072](https://github.com/apache/arrow-rs/issues/2072) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Several `Builder::append` methods returning results even though they are infallible [\#2071](https://github.com/apache/arrow-rs/issues/2071) +- Improve formatting of logical plans containing subqueries [\#2059](https://github.com/apache/arrow-rs/issues/2059) +- Return reference from `UnionArray::child` [\#2035](https://github.com/apache/arrow-rs/issues/2035) +- support write page index [\#1777](https://github.com/apache/arrow-rs/issues/1777) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Merged pull requests:** + +- Use `total_cmp` from std [\#2131](https://github.com/apache/arrow-rs/pull/2131) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- fix clippy [\#2124](https://github.com/apache/arrow-rs/pull/2124) ([alamb](https://github.com/alamb)) +- Fix logical merge conflict: `match` arms have incompatible types [\#2121](https://github.com/apache/arrow-rs/pull/2121) ([alamb](https://github.com/alamb)) +- Update `GenericBinaryBuilder` to use buffer builders directly. 
[\#2117](https://github.com/apache/arrow-rs/pull/2117) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Simplify null mask preservation in parquet reader [\#2116](https://github.com/apache/arrow-rs/pull/2116) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add get\_byte\_ranges method to AsyncFileReader trait [\#2115](https://github.com/apache/arrow-rs/pull/2115) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- add test for skip\_values in DictionaryDecoder and fix it [\#2105](https://github.com/apache/arrow-rs/pull/2105) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Define Decimal128Builder and Decimal128Array [\#2102](https://github.com/apache/arrow-rs/pull/2102) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support skip\_values in DictionaryDecoder [\#2100](https://github.com/apache/arrow-rs/pull/2100) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- Pass generate\_decimal256\_case integration test, add `DataType::Decimal256` [\#2094](https://github.com/apache/arrow-rs/pull/2094) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- `DecimalBuilder` should use `FixedSizeBinaryBuilder` [\#2092](https://github.com/apache/arrow-rs/pull/2092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Array writer indirection [\#2091](https://github.com/apache/arrow-rs/pull/2091) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Remove doc hidden from GenericColumnReader [\#2090](https://github.com/apache/arrow-rs/pull/2090) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Support skip\_values in ColumnValueDecoderImpl [\#2089](https://github.com/apache/arrow-rs/pull/2089) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- type\_id and value\_offset are incorrect for sliced UnionArray [\#2087](https://github.com/apache/arrow-rs/pull/2087) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add IPC truncation test case for StructArray [\#2083](https://github.com/apache/arrow-rs/pull/2083) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Improve performance of set\_bits by using copy\_from\_slice instead of setting individual bytes [\#2077](https://github.com/apache/arrow-rs/pull/2077) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Support skip\_values in ByteArrayColumnValueDecoder [\#2076](https://github.com/apache/arrow-rs/pull/2076) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Lazily materialize the null buffer builder of boolean builder [\#2073](https://github.com/apache/arrow-rs/pull/2073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- 
Fix windows CI \(\#2069\) [\#2070](https://github.com/apache/arrow-rs/pull/2070) ([tustvold](https://github.com/tustvold)) +- Test utf8\_validation checks char boundaries [\#2068](https://github.com/apache/arrow-rs/pull/2068) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat\(compute\): Support doy \(day of year\) for temporal [\#2067](https://github.com/apache/arrow-rs/pull/2067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ovr](https://github.com/ovr)) +- Support nullable indices in boolean take kernel and some optimizations [\#2064](https://github.com/apache/arrow-rs/pull/2064) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- Improve performance of filter\_dict [\#2063](https://github.com/apache/arrow-rs/pull/2063) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Ignore null buffer when creating ArrayData if null count is zero [\#2056](https://github.com/apache/arrow-rs/pull/2056) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- feat\(compute\): Support week0 \(PostgreSQL behaviour\) for temporal [\#2052](https://github.com/apache/arrow-rs/pull/2052) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ovr](https://github.com/ovr)) +- Set DICTIONARY\_ORDERED flag for FFI\_ArrowSchema [\#2050](https://github.com/apache/arrow-rs/pull/2050) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Generify parquet write path \(\#1764\) [\#2045](https://github.com/apache/arrow-rs/pull/2045) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Support peek\_next\_page\(\) and skip\_next\_page in serialized\_reader. 
[\#2044](https://github.com/apache/arrow-rs/pull/2044) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Support MapType in FFI [\#2042](https://github.com/apache/arrow-rs/pull/2042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add support of converting `FixedSizeBinaryArray` to `DecimalArray` [\#2041](https://github.com/apache/arrow-rs/pull/2041) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Truncate IPC record batch [\#2040](https://github.com/apache/arrow-rs/pull/2040) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Refine the List builder [\#2034](https://github.com/apache/arrow-rs/pull/2034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Add more tests of RecordReader Batch Size Edge Cases \(\#2025\) [\#2032](https://github.com/apache/arrow-rs/pull/2032) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add support for adding intervals to dates [\#2031](https://github.com/apache/arrow-rs/pull/2031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio)) + ## [18.0.0](https://github.com/apache/arrow-rs/tree/18.0.0) (2022-07-08) [Full Changelog](https://github.com/apache/arrow-rs/compare/17.0.0...18.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index a9ca0d911016..87f67015f22e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,95 +19,155 @@ # Changelog -## [19.0.0](https://github.com/apache/arrow-rs/tree/19.0.0) (2022-07-22) +## [20.0.0](https://github.com/apache/arrow-rs/tree/20.0.0) (2022-08-05) -[Full Changelog](https://github.com/apache/arrow-rs/compare/18.0.0...19.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/19.0.0...20.0.0) **Breaking changes:** -- Rename `DecimalArray``/DecimalBuilder` to `Decimal128Array`/`Decimal128Builder` [\#2101](https://github.com/apache/arrow-rs/issues/2101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Change builder `append` methods to be infallible where possible [\#2103](https://github.com/apache/arrow-rs/pull/2103) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Return reference from `UnionArray::child` \(\#2035\) [\#2099](https://github.com/apache/arrow-rs/pull/2099) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Remove `preserve_order` feature from `serde_json` dependency \(\#2095\) [\#2098](https://github.com/apache/arrow-rs/pull/2098) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Rename `weekday` and `weekday0` kernels to to `num_days_from_monday` and `num_days_since_sunday` [\#2066](https://github.com/apache/arrow-rs/pull/2066) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Remove `null_count` from `write_batch_with_statistics` [\#2047](https://github.com/apache/arrow-rs/pull/2047) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add more const evaluation for `GenericBinaryArray` and `GenericListArray`: add 
`PREFIX` and data type constructor [\#2327](https://github.com/apache/arrow-rs/pull/2327) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Make FFI support optional, change APIs to be `safe` \(\#2302\) [\#2303](https://github.com/apache/arrow-rs/pull/2303) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove `test_utils` from default features \(\#2298\) [\#2299](https://github.com/apache/arrow-rs/pull/2299) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Rename `DataType::Decimal` to `DataType::Decimal128` [\#2229](https://github.com/apache/arrow-rs/pull/2229) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add `Decimal128Iter` and `Decimal256Iter` and do maximum precision/scale check [\#2140](https://github.com/apache/arrow-rs/pull/2140) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) **Implemented enhancements:** -- Use `total_cmp` from std [\#2130](https://github.com/apache/arrow-rs/issues/2130) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Permit parallel fetching of column chunks in `ParquetRecordBatchStream` [\#2110](https://github.com/apache/arrow-rs/issues/2110) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- The `GenericBinaryBuilder` should use buffer builders directly. [\#2104](https://github.com/apache/arrow-rs/issues/2104) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Pass `generate_decimal256_case` arrow integration test [\#2093](https://github.com/apache/arrow-rs/issues/2093) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Rename `weekday` and `weekday0` kernels to to `num_days_from_monday` and `days_since_sunday` [\#2065](https://github.com/apache/arrow-rs/issues/2065) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve performance of `filter_dict` [\#2062](https://github.com/apache/arrow-rs/issues/2062) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve performance of `set_bits` [\#2060](https://github.com/apache/arrow-rs/issues/2060) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Lazily materialize the null buffer builder of `BooleanBuilder` [\#2058](https://github.com/apache/arrow-rs/issues/2058) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `BooleanArray::from_iter` should omit validity buffer if all values are valid [\#2055](https://github.com/apache/arrow-rs/issues/2055) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- FFI\_ArrowSchema should set `DICTIONARY_ORDERED` flag if a field's dictionary is ordered [\#2049](https://github.com/apache/arrow-rs/issues/2049) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `peek_next_page()` and `skip_next_page` in `SerializedPageReader` [\#2043](https://github.com/apache/arrow-rs/issues/2043) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Support FFI / C Data Interface for `MapType` [\#2037](https://github.com/apache/arrow-rs/issues/2037) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- The `DecimalArrayBuilder` should use `FixedSizedBinaryBuilder` [\#2026](https://github.com/apache/arrow-rs/issues/2026) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Enable `serialized_reader` read specific Page by passing row ranges. [\#1976](https://github.com/apache/arrow-rs/issues/1976) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add the constant data type constructors for `ListArray` [\#2311](https://github.com/apache/arrow-rs/issues/2311) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update `FlightSqlService` trait to pass session info along [\#2308](https://github.com/apache/arrow-rs/issues/2308) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Optimize `take_bits` for non-null indices [\#2306](https://github.com/apache/arrow-rs/issues/2306) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make FFI support optional via Feature Flag `ffi` [\#2302](https://github.com/apache/arrow-rs/issues/2302) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Mark `ffi::ArrowArray::try_new` is safe [\#2301](https://github.com/apache/arrow-rs/issues/2301) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove test\_utils from default arrow-rs features [\#2298](https://github.com/apache/arrow-rs/issues/2298) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove `JsonEqual` trait [\#2296](https://github.com/apache/arrow-rs/issues/2296) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Move `with_precision_and_scale` to `Decimal` array traits [\#2291](https://github.com/apache/arrow-rs/issues/2291) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve readability and maybe performance of string --\> numeric/time/date/timetamp cast kernels [\#2285](https://github.com/apache/arrow-rs/issues/2285) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add vectorized unpacking for 8, 16, and 64 bit integers [\#2276](https://github.com/apache/arrow-rs/issues/2276) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Use initial capacity for interner hashmap [\#2273](https://github.com/apache/arrow-rs/issues/2273) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Impl FromIterator for Decimal256Array [\#2248](https://github.com/apache/arrow-rs/issues/2248) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Separate `ArrayReader::next_batch`with `ArrayReader::read_records` and `ArrayReader::consume_batch` [\#2236](https://github.com/apache/arrow-rs/issues/2236) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Rename `DataType::Decimal` to `DataType::Decimal128` [\#2228](https://github.com/apache/arrow-rs/issues/2228) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Automatically Grow Parquet BitWriter Buffer [\#2226](https://github.com/apache/arrow-rs/issues/2226) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add `append_option` support to `Decimal128Builder` and `Decimal256Builder` [\#2224](https://github.com/apache/arrow-rs/issues/2224) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Split the `FixedSizeBinaryArray` and `FixedSizeListArray` from `array_binary.rs` and `array_list.rs` [\#2217](https://github.com/apache/arrow-rs/issues/2217) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Don't `Box` Values in `PrimitiveDictionaryBuilder` [\#2215](https://github.com/apache/arrow-rs/issues/2215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use BitChunks in equal\_bits [\#2186](https://github.com/apache/arrow-rs/issues/2186) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement `Hash` for `Schema` [\#2182](https://github.com/apache/arrow-rs/issues/2182) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- read decimal data type from parquet file with binary physical type [\#2159](https://github.com/apache/arrow-rs/issues/2159) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- The `GenericStringBuilder` should use `GenericBinaryBuilder` [\#2156](https://github.com/apache/arrow-rs/issues/2156) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update Rust version to 1.62 [\#2143](https://github.com/apache/arrow-rs/issues/2143) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Check precision and scale against maximum value when constructing `Decimal128` and `Decimal256` [\#2139](https://github.com/apache/arrow-rs/issues/2139) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use `ArrayAccessor` in `Decimal128Iter` and `Decimal256Iter` [\#2138](https://github.com/apache/arrow-rs/issues/2138) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use `ArrayAccessor` and `FromIterator` in Cast Kernels [\#2137](https://github.com/apache/arrow-rs/issues/2137) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `TypedDictionaryArray` for more ergonomic interaction with `DictionaryArray` [\#2136](https://github.com/apache/arrow-rs/issues/2136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use `ArrayAccessor` in Comparison Kernels [\#2135](https://github.com/apache/arrow-rs/issues/2135) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `peek_next_page()` and s`kip_next_page` in `InMemoryColumnChunkReader` [\#2129](https://github.com/apache/arrow-rs/issues/2129) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Lazily materialize the null buffer builder for all array builders. 
[\#2125](https://github.com/apache/arrow-rs/issues/2125) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Do value validation for `Decimal256` [\#2112](https://github.com/apache/arrow-rs/issues/2112) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `skip_def_levels` for `ColumnLevelDecoder` [\#2107](https://github.com/apache/arrow-rs/issues/2107) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add integration test for scan rows with selection [\#2106](https://github.com/apache/arrow-rs/issues/2106) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support for casting from Utf8/String to `Time32` / `Time64` [\#2053](https://github.com/apache/arrow-rs/issues/2053) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update prost and tonic related crates [\#2268](https://github.com/apache/arrow-rs/pull/2268) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([carols10cents](https://github.com/carols10cents)) **Fixed bugs:** -- `type_id` and `value_offset` are incorrect for sliced `UnionArray` [\#2086](https://github.com/apache/arrow-rs/issues/2086) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Boolean `take` kernel does not handle null indices correctly [\#2057](https://github.com/apache/arrow-rs/issues/2057) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Don't double-count nulls in `write_batch_with_statistics` [\#2046](https://github.com/apache/arrow-rs/issues/2046) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Parquet Writer Ignores Statistics specification in `WriterProperties` [\#2014](https://github.com/apache/arrow-rs/issues/2014) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- temporal conversion functions cannot work on negative input properly [\#2325](https://github.com/apache/arrow-rs/issues/2325) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- IPC writer should truncate string array with all empty string [\#2312](https://github.com/apache/arrow-rs/issues/2312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Error order for comparing `Decimal128` or `Decimal256` [\#2256](https://github.com/apache/arrow-rs/issues/2256) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix maximum and minimum for decimal values for precision greater than 38 [\#2246](https://github.com/apache/arrow-rs/issues/2246) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `IntervalMonthDayNanoType::make_value()` does not match C implementation [\#2234](https://github.com/apache/arrow-rs/issues/2234) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `FlightSqlService` trait does not allow `impl`s to do handshake [\#2210](https://github.com/apache/arrow-rs/issues/2210) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- `EnabledStatistics::None` not working [\#2185](https://github.com/apache/arrow-rs/issues/2185) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Boolean ArrayData Equality Incorrect Slice Handling [\#2184](https://github.com/apache/arrow-rs/issues/2184) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Publicly export MapFieldNames [\#2118](https://github.com/apache/arrow-rs/issues/2118) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Documentation updates:** -- Improve docstrings + examples for `as_primitive_array` cast functions [\#2114](https://github.com/apache/arrow-rs/pull/2114) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Update instructions on How to join the slack \#arrow-rust channel -- or maybe try to switch to discord?? [\#2192](https://github.com/apache/arrow-rs/issues/2192) +- \[Minor\] Improve arrow and parquet READMEs, document parquet feature flags [\#2324](https://github.com/apache/arrow-rs/pull/2324) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Performance improvements:** + +- Improve speed of writing string dictionaries to parquet by skipping a copy\(\#1764\) [\#2322](https://github.com/apache/arrow-rs/pull/2322) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Closed issues:** -- Why does `serde_json` specify the `preserve_order` feature in `arrow` package [\#2095](https://github.com/apache/arrow-rs/issues/2095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `skip_values` in DictionaryDecoder [\#2079](https://github.com/apache/arrow-rs/issues/2079) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Support skip\_values in ColumnValueDecoderImpl [\#2078](https://github.com/apache/arrow-rs/issues/2078) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Support `skip_values` in `ByteArrayColumnValueDecoder` [\#2072](https://github.com/apache/arrow-rs/issues/2072) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Several `Builder::append` methods returning results even though they are infallible [\#2071](https://github.com/apache/arrow-rs/issues/2071) -- Improve formatting of logical plans containing subqueries [\#2059](https://github.com/apache/arrow-rs/issues/2059) -- Return reference from `UnionArray::child` [\#2035](https://github.com/apache/arrow-rs/issues/2035) -- support write page index [\#1777](https://github.com/apache/arrow-rs/issues/1777) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Fix wrong logic in calculate\_row\_count when skipping values [\#2328](https://github.com/apache/arrow-rs/issues/2328) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support filter for parquet data type [\#2126](https://github.com/apache/arrow-rs/issues/2126) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Make skip value in ByteArrayDecoderDictionary avoid decoding [\#2088](https://github.com/apache/arrow-rs/issues/2088) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Merged pull requests:** -- Use `total_cmp` from std [\#2131](https://github.com/apache/arrow-rs/pull/2131) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- fix clippy [\#2124](https://github.com/apache/arrow-rs/pull/2124) ([alamb](https://github.com/alamb)) -- Fix logical merge conflict: `match` arms have incompatible types [\#2121](https://github.com/apache/arrow-rs/pull/2121) ([alamb](https://github.com/alamb)) -- Update `GenericBinaryBuilder` to use buffer builders directly. 
[\#2117](https://github.com/apache/arrow-rs/pull/2117) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Simplify null mask preservation in parquet reader [\#2116](https://github.com/apache/arrow-rs/pull/2116) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add get\_byte\_ranges method to AsyncFileReader trait [\#2115](https://github.com/apache/arrow-rs/pull/2115) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) -- add test for skip\_values in DictionaryDecoder and fix it [\#2105](https://github.com/apache/arrow-rs/pull/2105) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Define Decimal128Builder and Decimal128Array [\#2102](https://github.com/apache/arrow-rs/pull/2102) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Support skip\_values in DictionaryDecoder [\#2100](https://github.com/apache/arrow-rs/pull/2100) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) -- Pass generate\_decimal256\_case integration test, add `DataType::Decimal256` [\#2094](https://github.com/apache/arrow-rs/pull/2094) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- `DecimalBuilder` should use `FixedSizeBinaryBuilder` [\#2092](https://github.com/apache/arrow-rs/pull/2092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Array writer indirection [\#2091](https://github.com/apache/arrow-rs/pull/2091) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Remove doc hidden from GenericColumnReader [\#2090](https://github.com/apache/arrow-rs/pull/2090) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Support skip\_values in ColumnValueDecoderImpl [\#2089](https://github.com/apache/arrow-rs/pull/2089) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) -- type\_id and value\_offset are incorrect for sliced UnionArray [\#2087](https://github.com/apache/arrow-rs/pull/2087) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add IPC truncation test case for StructArray [\#2083](https://github.com/apache/arrow-rs/pull/2083) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Improve performance of set\_bits by using copy\_from\_slice instead of setting individual bytes [\#2077](https://github.com/apache/arrow-rs/pull/2077) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Support skip\_values in ByteArrayColumnValueDecoder [\#2076](https://github.com/apache/arrow-rs/pull/2076) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Lazily materialize the null buffer builder of boolean builder [\#2073](https://github.com/apache/arrow-rs/pull/2073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- 
Fix windows CI \(\#2069\) [\#2070](https://github.com/apache/arrow-rs/pull/2070) ([tustvold](https://github.com/tustvold)) -- Test utf8\_validation checks char boundaries [\#2068](https://github.com/apache/arrow-rs/pull/2068) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat\(compute\): Support doy \(day of year\) for temporal [\#2067](https://github.com/apache/arrow-rs/pull/2067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ovr](https://github.com/ovr)) -- Support nullable indices in boolean take kernel and some optimizations [\#2064](https://github.com/apache/arrow-rs/pull/2064) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- Improve performance of filter\_dict [\#2063](https://github.com/apache/arrow-rs/pull/2063) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Ignore null buffer when creating ArrayData if null count is zero [\#2056](https://github.com/apache/arrow-rs/pull/2056) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- feat\(compute\): Support week0 \(PostgreSQL behaviour\) for temporal [\#2052](https://github.com/apache/arrow-rs/pull/2052) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ovr](https://github.com/ovr)) -- Set DICTIONARY\_ORDERED flag for FFI\_ArrowSchema [\#2050](https://github.com/apache/arrow-rs/pull/2050) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Generify parquet write path \(\#1764\) [\#2045](https://github.com/apache/arrow-rs/pull/2045) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Support peek\_next\_page\(\) and skip\_next\_page in serialized\_reader. [\#2044](https://github.com/apache/arrow-rs/pull/2044) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Support MapType in FFI [\#2042](https://github.com/apache/arrow-rs/pull/2042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add support of converting `FixedSizeBinaryArray` to `DecimalArray` [\#2041](https://github.com/apache/arrow-rs/pull/2041) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Truncate IPC record batch [\#2040](https://github.com/apache/arrow-rs/pull/2040) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Refine the List builder [\#2034](https://github.com/apache/arrow-rs/pull/2034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Add more tests of RecordReader Batch Size Edge Cases \(\#2025\) [\#2032](https://github.com/apache/arrow-rs/pull/2032) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add support for adding intervals to dates [\#2031](https://github.com/apache/arrow-rs/pull/2031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio)) +- fix: Fix skip error in calculate\_row\_count. 
[\#2329](https://github.com/apache/arrow-rs/pull/2329) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- temporal conversion functions should work on negative input properly [\#2326](https://github.com/apache/arrow-rs/pull/2326) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Increase DeltaBitPackEncoder miniblock size to 64 for 64-bit integers \(\#2282\) [\#2319](https://github.com/apache/arrow-rs/pull/2319) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Remove JsonEqual [\#2317](https://github.com/apache/arrow-rs/pull/2317) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- fix: IPC writer should truncate string array with all empty string [\#2314](https://github.com/apache/arrow-rs/pull/2314) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JasonLi-cn](https://github.com/JasonLi-cn)) +- Pass pull `Request` to `FlightSqlService` `impl`s [\#2309](https://github.com/apache/arrow-rs/pull/2309) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Speedup take\_boolean / take\_bits for non-null indices \(~4 - 5x speedup\) [\#2307](https://github.com/apache/arrow-rs/pull/2307) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Add typed dictionary \(\#2136\) [\#2297](https://github.com/apache/arrow-rs/pull/2297) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- \[Minor\] Improve types shown in cast error messages [\#2295](https://github.com/apache/arrow-rs/pull/2295) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Move `with_precision_and_scale` to `BasicDecimalArray` trait [\#2292](https://github.com/apache/arrow-rs/pull/2292) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Replace the `fn get_data_type` by `const DATA_TYPE` in BinaryArray and StringArray [\#2289](https://github.com/apache/arrow-rs/pull/2289) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Clean up string casts and improve performance [\#2284](https://github.com/apache/arrow-rs/pull/2284) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- \[Minor\] Add tests for temporal cast error paths [\#2283](https://github.com/apache/arrow-rs/pull/2283) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add unpack8, unpack16, unpack64 \(\#2276\) ~10-50% faster [\#2278](https://github.com/apache/arrow-rs/pull/2278) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix bugs in the `from_list` function. 
[\#2277](https://github.com/apache/arrow-rs/pull/2277) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- fix: use signed comparator to compare decimal128 and decimal256 [\#2275](https://github.com/apache/arrow-rs/pull/2275) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Use initial capacity for interner hashmap [\#2272](https://github.com/apache/arrow-rs/pull/2272) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Dandandan](https://github.com/Dandandan)) +- Remove fallibility from paruqet RleEncoder \(\#2226\) [\#2259](https://github.com/apache/arrow-rs/pull/2259) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix escaped like wildcards in `like_utf8` / `nlike_utf8` kernels [\#2258](https://github.com/apache/arrow-rs/pull/2258) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([daniel-martinez-maqueda-sap](https://github.com/daniel-martinez-maqueda-sap)) +- Add tests for reading nested decimal arrays from parquet [\#2254](https://github.com/apache/arrow-rs/pull/2254) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- feat: Implement string cast operations for Time32 and Time64 [\#2251](https://github.com/apache/arrow-rs/pull/2251) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([stuartcarnie](https://github.com/stuartcarnie)) +- move `FixedSizeList` to `array_fixed_size_list.rs` [\#2250](https://github.com/apache/arrow-rs/pull/2250) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Impl FromIterator for Decimal256Array [\#2247](https://github.com/apache/arrow-rs/pull/2247) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix max and min value for decimal precision greater than 38 [\#2245](https://github.com/apache/arrow-rs/pull/2245) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Make `Schema::fields` and `Schema::metadata` `pub` \(public\) [\#2239](https://github.com/apache/arrow-rs/pull/2239) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- \[Minor\] Improve Schema metadata mismatch error [\#2238](https://github.com/apache/arrow-rs/pull/2238) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Separate ArrayReader::next\_batch with read\_records and consume\_batch [\#2237](https://github.com/apache/arrow-rs/pull/2237) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Update `IntervalMonthDayNanoType::make_value()` to conform to specifications [\#2235](https://github.com/apache/arrow-rs/pull/2235) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Disable value validation for Decimal256 case [\#2232](https://github.com/apache/arrow-rs/pull/2232) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Automatically grow parquet BitWriter \(\#2226\) \(~10% faster\) [\#2231](https://github.com/apache/arrow-rs/pull/2231) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Only trigger `arrow` CI on changes to arrow 
[\#2227](https://github.com/apache/arrow-rs/pull/2227) ([alamb](https://github.com/alamb)) +- Add append\_option support to decimal builders [\#2225](https://github.com/apache/arrow-rs/pull/2225) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([bphillips-exos](https://github.com/bphillips-exos)) +- Optimized writing of byte array to parquet \(\#1764\) \(2x faster\) [\#2221](https://github.com/apache/arrow-rs/pull/2221) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Increase test coverage of ArrowWriter [\#2220](https://github.com/apache/arrow-rs/pull/2220) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update instructions on how to join the Slack channel [\#2219](https://github.com/apache/arrow-rs/pull/2219) ([HaoYang670](https://github.com/HaoYang670)) +- Move `FixedSizeBinaryArray` to `array_fixed_size_binary.rs` [\#2218](https://github.com/apache/arrow-rs/pull/2218) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Avoid boxing in PrimitiveDictionaryBuilder [\#2216](https://github.com/apache/arrow-rs/pull/2216) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- remove redundant CI benchmark check, cleanups [\#2212](https://github.com/apache/arrow-rs/pull/2212) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Update `FlightSqlService` trait to proxy handshake [\#2211](https://github.com/apache/arrow-rs/pull/2211) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([avantgardnerio](https://github.com/avantgardnerio)) +- parquet: export json api with `serde_json` feature name [\#2209](https://github.com/apache/arrow-rs/pull/2209) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([flisky](https://github.com/flisky)) +- Cleanup record skipping logic and tests \(\#2158\) [\#2199](https://github.com/apache/arrow-rs/pull/2199) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Use BitChunks in equal\_bits [\#2194](https://github.com/apache/arrow-rs/pull/2194) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix disabling parquet statistics \(\#2185\) [\#2191](https://github.com/apache/arrow-rs/pull/2191) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Change CI names to match crate names [\#2189](https://github.com/apache/arrow-rs/pull/2189) ([alamb](https://github.com/alamb)) +- Fix offset handling in boolean\_equal \(\#2184\) [\#2187](https://github.com/apache/arrow-rs/pull/2187) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement `Hash` for `Schema` [\#2183](https://github.com/apache/arrow-rs/pull/2183) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Let the `StringBuilder` use `BinaryBuilder` [\#2181](https://github.com/apache/arrow-rs/pull/2181) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Use ArrayAccessor and FromIterator in Cast Kernels [\#2169](https://github.com/apache/arrow-rs/pull/2169) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Split most arrow 
specific CI checks into their own workflows \(reduce common CI time to 21 minutes\) [\#2168](https://github.com/apache/arrow-rs/pull/2168) ([alamb](https://github.com/alamb)) +- Remove another attempt to cache target directory in action.yaml [\#2167](https://github.com/apache/arrow-rs/pull/2167) ([alamb](https://github.com/alamb)) +- Run actions on push to master, pull requests [\#2166](https://github.com/apache/arrow-rs/pull/2166) ([alamb](https://github.com/alamb)) +- Break parquet\_derive and arrow\_flight tests into their own workflows [\#2165](https://github.com/apache/arrow-rs/pull/2165) ([alamb](https://github.com/alamb)) +- \[minor\] use type aliases refine code. [\#2161](https://github.com/apache/arrow-rs/pull/2161) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- parquet reader: Support reading decimals from parquet `BYTE_ARRAY` type [\#2160](https://github.com/apache/arrow-rs/pull/2160) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) +- Add integration test for scan rows with selection [\#2158](https://github.com/apache/arrow-rs/pull/2158) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Use ArrayAccessor in Comparison Kernels [\#2157](https://github.com/apache/arrow-rs/pull/2157) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Implement `peek\_next\_page` and `skip\_next\_page` for `InMemoryColumnCh… [\#2155](https://github.com/apache/arrow-rs/pull/2155) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- Avoid decoding unneeded values in ByteArrayDecoderDictionary [\#2154](https://github.com/apache/arrow-rs/pull/2154) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- Only run integration tests when `arrow` changes [\#2152](https://github.com/apache/arrow-rs/pull/2152) ([alamb](https://github.com/alamb)) +- Break out docs CI job to its own github action [\#2151](https://github.com/apache/arrow-rs/pull/2151) ([alamb](https://github.com/alamb)) +- Do not pretend to cache rust build artifacts, speed up CI by ~20% [\#2150](https://github.com/apache/arrow-rs/pull/2150) ([alamb](https://github.com/alamb)) +- Update rust version to 1.62 [\#2144](https://github.com/apache/arrow-rs/pull/2144) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Make MapFieldNames public \(\#2118\) [\#2134](https://github.com/apache/arrow-rs/pull/2134) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add ArrayAccessor trait, remove duplication in array iterators \(\#1948\) [\#2133](https://github.com/apache/arrow-rs/pull/2133) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Lazily materialize the null buffer builder for all array builders. 
[\#2127](https://github.com/apache/arrow-rs/pull/2127) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Faster parquet DictEncoder \(~20%\) [\#2123](https://github.com/apache/arrow-rs/pull/2123) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add validation for Decimal256 [\#2113](https://github.com/apache/arrow-rs/pull/2113) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support skip\_def\_levels for ColumnLevelDecoder [\#2111](https://github.com/apache/arrow-rs/pull/2111) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Donate `object_store` code from object\_store\_rs to arrow-rs [\#2081](https://github.com/apache/arrow-rs/pull/2081) ([alamb](https://github.com/alamb)) +- Improve `validate_utf8` performance [\#2048](https://github.com/apache/arrow-rs/pull/2048) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tfeda](https://github.com/tfeda)) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 73f297d011f4..92c6aac3d082 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "19.0.0" +version = "20.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,7 +27,7 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow = { path = "../arrow", version = "19.0.0", default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "20.0.0", default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index cbe10d9bec74..db9b75377d29 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "19.0.0" +arrow-flight = "20.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 7e2a759c5590..aa0d407113d7 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -16,7 +16,7 @@ // under the License. 
use arrow_flight::sql::{ActionCreatePreparedStatementResult, SqlInfo}; -use arrow_flight::{FlightData, HandshakeRequest, HandshakeResponse}; +use arrow_flight::{Action, FlightData, HandshakeRequest, HandshakeResponse, Ticket}; use futures::Stream; use std::pin::Pin; use tonic::transport::Server; @@ -93,179 +93,253 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn get_flight_info_statement( &self, _query: CommandStatementQuery, - _request: FlightDescriptor, + _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "get_flight_info_statement not implemented", + )) } + async fn get_flight_info_prepared_statement( &self, _query: CommandPreparedStatementQuery, - _request: FlightDescriptor, + _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "get_flight_info_prepared_statement not implemented", + )) } + async fn get_flight_info_catalogs( &self, _query: CommandGetCatalogs, - _request: FlightDescriptor, + _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "get_flight_info_catalogs not implemented", + )) } + async fn get_flight_info_schemas( &self, _query: CommandGetDbSchemas, - _request: FlightDescriptor, + _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "get_flight_info_schemas not implemented", + )) } + async fn get_flight_info_tables( &self, _query: CommandGetTables, - _request: FlightDescriptor, + _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "get_flight_info_tables not implemented", + )) } + async fn get_flight_info_table_types( &self, _query: CommandGetTableTypes, - _request: FlightDescriptor, + _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "get_flight_info_table_types not implemented", + )) } + async fn get_flight_info_sql_info( &self, _query: CommandGetSqlInfo, - _request: FlightDescriptor, + _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "get_flight_info_sql_info not implemented", + )) } + async fn get_flight_info_primary_keys( &self, _query: CommandGetPrimaryKeys, - _request: FlightDescriptor, + _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "get_flight_info_primary_keys not implemented", + )) } + async fn get_flight_info_exported_keys( &self, _query: CommandGetExportedKeys, - _request: FlightDescriptor, + _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "get_flight_info_exported_keys not implemented", + )) } + async fn get_flight_info_imported_keys( &self, _query: CommandGetImportedKeys, - _request: FlightDescriptor, + _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "get_flight_info_imported_keys not implemented", + )) } + async fn get_flight_info_cross_reference( &self, _query: CommandGetCrossReference, - _request: FlightDescriptor, + _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "get_flight_info_imported_keys not implemented", + )) } + // do_get 
async fn do_get_statement( &self, _ticket: TicketStatementQuery, + _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("do_get_statement not implemented")) } async fn do_get_prepared_statement( &self, _query: CommandPreparedStatementQuery, + _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "do_get_prepared_statement not implemented", + )) } + async fn do_get_catalogs( &self, _query: CommandGetCatalogs, + _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("do_get_catalogs not implemented")) } + async fn do_get_schemas( &self, _query: CommandGetDbSchemas, + _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("do_get_schemas not implemented")) } + async fn do_get_tables( &self, _query: CommandGetTables, + _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("do_get_tables not implemented")) } + async fn do_get_table_types( &self, _query: CommandGetTableTypes, + _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("do_get_table_types not implemented")) } + async fn do_get_sql_info( &self, _query: CommandGetSqlInfo, + _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("do_get_sql_info not implemented")) } + async fn do_get_primary_keys( &self, _query: CommandGetPrimaryKeys, + _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("do_get_primary_keys not implemented")) } + async fn do_get_exported_keys( &self, _query: CommandGetExportedKeys, + _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "do_get_exported_keys not implemented", + )) } + async fn do_get_imported_keys( &self, _query: CommandGetImportedKeys, + _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "do_get_imported_keys not implemented", + )) } + async fn do_get_cross_reference( &self, _query: CommandGetCrossReference, + _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "do_get_cross_reference not implemented", + )) } + // do_put async fn do_put_statement_update( &self, _ticket: CommandStatementUpdate, + _request: Request>, ) -> Result { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "do_put_statement_update not implemented", + )) } + async fn do_put_prepared_statement_query( &self, _query: CommandPreparedStatementQuery, - _request: Streaming, + _request: Request>, ) -> Result::DoPutStream>, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + "do_put_prepared_statement_query not implemented", + )) } + async fn do_put_prepared_statement_update( &self, _query: CommandPreparedStatementUpdate, - _request: Streaming, + _request: Request>, ) -> Result { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented( + 
"do_put_prepared_statement_update not implemented", + )) } + // do_action async fn do_action_create_prepared_statement( &self, _query: ActionCreatePreparedStatementRequest, + _request: Request, ) -> Result { Err(Status::unimplemented("Not yet implemented")) } async fn do_action_close_prepared_statement( &self, _query: ActionClosePreparedStatementRequest, + _request: Request, ) { unimplemented!("Not yet implemented") } diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 5cfbd3f60657..3f4f09855353 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -28,6 +28,7 @@ use std::{ ops::Deref, }; +#[allow(clippy::derive_partial_eq_without_eq)] mod gen { include!("arrow.flight.protocol.rs"); } diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index 2d9d88638588..6e8f104dc5b8 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -65,77 +65,77 @@ pub trait FlightSqlService: async fn get_flight_info_statement( &self, query: CommandStatementQuery, - request: FlightDescriptor, + request: Request, ) -> Result, Status>; /// Get a FlightInfo for executing an already created prepared statement. async fn get_flight_info_prepared_statement( &self, query: CommandPreparedStatementQuery, - request: FlightDescriptor, + request: Request, ) -> Result, Status>; /// Get a FlightInfo for listing catalogs. async fn get_flight_info_catalogs( &self, query: CommandGetCatalogs, - request: FlightDescriptor, + request: Request, ) -> Result, Status>; /// Get a FlightInfo for listing schemas. async fn get_flight_info_schemas( &self, query: CommandGetDbSchemas, - request: FlightDescriptor, + request: Request, ) -> Result, Status>; /// Get a FlightInfo for listing tables. async fn get_flight_info_tables( &self, query: CommandGetTables, - request: FlightDescriptor, + request: Request, ) -> Result, Status>; /// Get a FlightInfo to extract information about the table types. async fn get_flight_info_table_types( &self, query: CommandGetTableTypes, - request: FlightDescriptor, + request: Request, ) -> Result, Status>; /// Get a FlightInfo for retrieving other information (See SqlInfo). async fn get_flight_info_sql_info( &self, query: CommandGetSqlInfo, - request: FlightDescriptor, + request: Request, ) -> Result, Status>; /// Get a FlightInfo to extract information about primary and foreign keys. async fn get_flight_info_primary_keys( &self, query: CommandGetPrimaryKeys, - request: FlightDescriptor, + request: Request, ) -> Result, Status>; /// Get a FlightInfo to extract information about exported keys. async fn get_flight_info_exported_keys( &self, query: CommandGetExportedKeys, - request: FlightDescriptor, + request: Request, ) -> Result, Status>; /// Get a FlightInfo to extract information about imported keys. async fn get_flight_info_imported_keys( &self, query: CommandGetImportedKeys, - request: FlightDescriptor, + request: Request, ) -> Result, Status>; /// Get a FlightInfo to extract information about cross reference. async fn get_flight_info_cross_reference( &self, query: CommandGetCrossReference, - request: FlightDescriptor, + request: Request, ) -> Result, Status>; // do_get @@ -144,66 +144,77 @@ pub trait FlightSqlService: async fn do_get_statement( &self, ticket: TicketStatementQuery, + request: Request, ) -> Result::DoGetStream>, Status>; /// Get a FlightDataStream containing the prepared statement query results. 
async fn do_get_prepared_statement( &self, query: CommandPreparedStatementQuery, + request: Request, ) -> Result::DoGetStream>, Status>; /// Get a FlightDataStream containing the list of catalogs. async fn do_get_catalogs( &self, query: CommandGetCatalogs, + request: Request, ) -> Result::DoGetStream>, Status>; /// Get a FlightDataStream containing the list of schemas. async fn do_get_schemas( &self, query: CommandGetDbSchemas, + request: Request, ) -> Result::DoGetStream>, Status>; /// Get a FlightDataStream containing the list of tables. async fn do_get_tables( &self, query: CommandGetTables, + request: Request, ) -> Result::DoGetStream>, Status>; /// Get a FlightDataStream containing the data related to the table types. async fn do_get_table_types( &self, query: CommandGetTableTypes, + request: Request, ) -> Result::DoGetStream>, Status>; /// Get a FlightDataStream containing the list of SqlInfo results. async fn do_get_sql_info( &self, query: CommandGetSqlInfo, + request: Request, ) -> Result::DoGetStream>, Status>; /// Get a FlightDataStream containing the data related to the primary and foreign keys. async fn do_get_primary_keys( &self, query: CommandGetPrimaryKeys, + request: Request, ) -> Result::DoGetStream>, Status>; /// Get a FlightDataStream containing the data related to the exported keys. async fn do_get_exported_keys( &self, query: CommandGetExportedKeys, + request: Request, ) -> Result::DoGetStream>, Status>; /// Get a FlightDataStream containing the data related to the imported keys. async fn do_get_imported_keys( &self, query: CommandGetImportedKeys, + request: Request, ) -> Result::DoGetStream>, Status>; /// Get a FlightDataStream containing the data related to the cross reference. async fn do_get_cross_reference( &self, query: CommandGetCrossReference, + request: Request, ) -> Result::DoGetStream>, Status>; // do_put @@ -212,20 +223,21 @@ pub trait FlightSqlService: async fn do_put_statement_update( &self, ticket: CommandStatementUpdate, + request: Request>, ) -> Result; /// Bind parameters to given prepared statement. async fn do_put_prepared_statement_query( &self, query: CommandPreparedStatementQuery, - request: Streaming, + request: Request>, ) -> Result::DoPutStream>, Status>; /// Execute an update SQL prepared statement. async fn do_put_prepared_statement_update( &self, query: CommandPreparedStatementUpdate, - request: Streaming, + request: Request>, ) -> Result; // do_action @@ -234,12 +246,14 @@ pub trait FlightSqlService: async fn do_action_create_prepared_statement( &self, query: ActionCreatePreparedStatementRequest, + request: Request, ) -> Result; /// Close a prepared statement. async fn do_action_close_prepared_statement( &self, query: ActionClosePreparedStatementRequest, + request: Request, ); /// Register a new SqlInfo result, making it available when calling GetSqlInfo. @@ -287,119 +301,87 @@ where &self, request: Request, ) -> Result, Status> { - let request = request.into_inner(); let any: prost_types::Any = - prost::Message::decode(&*request.cmd).map_err(decode_error_to_status)?; + Message::decode(&*request.get_ref().cmd).map_err(decode_error_to_status)?; if any.is::() { - return self - .get_flight_info_statement( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_statement(token, request).await; } if any.is::() { + let handle = any + .unpack() + .map_err(arrow_error_to_status)? 
+ .expect("unreachable"); return self - .get_flight_info_prepared_statement( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) + .get_flight_info_prepared_statement(handle, request) .await; } if any.is::() { - return self - .get_flight_info_catalogs( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_catalogs(token, request).await; } if any.is::() { - return self - .get_flight_info_schemas( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_schemas(token, request).await; } if any.is::() { - return self - .get_flight_info_tables( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_tables(token, request).await; } if any.is::() { - return self - .get_flight_info_table_types( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_table_types(token, request).await; } if any.is::() { - return self - .get_flight_info_sql_info( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_sql_info(token, request).await; } if any.is::() { - return self - .get_flight_info_primary_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_primary_keys(token, request).await; } if any.is::() { - return self - .get_flight_info_exported_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_exported_keys(token, request).await; } if any.is::() { - return self - .get_flight_info_imported_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_imported_keys(token, request).await; } if any.is::() { - return self - .get_flight_info_cross_reference( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? 
+ .expect("unreachable"); + return self.get_flight_info_cross_reference(token, request).await; } Err(Status::unimplemented(format!( @@ -419,161 +401,131 @@ where &self, request: Request, ) -> Result, Status> { - let request = request.into_inner(); - let any: prost_types::Any = - prost::Message::decode(&*request.ticket).map_err(decode_error_to_status)?; + let any: prost_types::Any = prost::Message::decode(&*request.get_ref().ticket) + .map_err(decode_error_to_status)?; if any.is::() { - return self - .do_get_statement( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_statement(token, request).await; } if any.is::() { - return self - .do_get_prepared_statement( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_prepared_statement(token, request).await; } if any.is::() { - return self - .do_get_catalogs( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_catalogs(token, request).await; } if any.is::() { - return self - .do_get_schemas( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_schemas(token, request).await; } if any.is::() { - return self - .do_get_tables( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_tables(token, request).await; } if any.is::() { - return self - .do_get_table_types( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_table_types(token, request).await; } if any.is::() { - return self - .do_get_sql_info( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_sql_info(token, request).await; } if any.is::() { - return self - .do_get_primary_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_primary_keys(token, request).await; } if any.is::() { - return self - .do_get_exported_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_exported_keys(token, request).await; } if any.is::() { - return self - .do_get_imported_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_imported_keys(token, request).await; } if any.is::() { - return self - .do_get_cross_reference( - any.unpack() - .map_err(arrow_error_to_status)? 
- .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_cross_reference(token, request).await; } Err(Status::unimplemented(format!( "do_get: The defined request is invalid: {:?}", - String::from_utf8(request.ticket).unwrap() + String::from_utf8(request.get_ref().ticket.clone()).unwrap() ))) } async fn do_put( &self, - request: Request>, + mut request: Request>, ) -> Result, Status> { - let mut request = request.into_inner(); - let cmd = request.message().await?.unwrap(); + let cmd = request.get_mut().message().await?.unwrap(); let any: prost_types::Any = prost::Message::decode(&*cmd.flight_descriptor.unwrap().cmd) .map_err(decode_error_to_status)?; if any.is::() { - let record_count = self - .do_put_statement_update( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await?; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + let record_count = self.do_put_statement_update(token, request).await?; let result = DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(super::super::gen::PutResult { - app_metadata: result.as_any().encode_to_vec(), + app_metadata: result.encode_to_vec(), })]); return Ok(Response::new(Box::pin(output))); } if any.is::() { - return self - .do_put_prepared_statement_query( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_put_prepared_statement_query(token, request).await; } if any.is::() { + let handle = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); let record_count = self - .do_put_prepared_statement_update( - any.unpack() - .map_err(arrow_error_to_status)? 
- .expect("unreachable"), - request, - ) + .do_put_prepared_statement_update(handle, request) .await?; let result = DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(super::super::gen::PutResult { - app_metadata: result.as_any().encode_to_vec(), + app_metadata: result.encode_to_vec(), })]); return Ok(Response::new(Box::pin(output))); } @@ -614,11 +566,9 @@ where &self, request: Request, ) -> Result, Status> { - let request = request.into_inner(); - - if request.r#type == CREATE_PREPARED_STATEMENT { - let any: prost_types::Any = - prost::Message::decode(&*request.body).map_err(decode_error_to_status)?; + if request.get_ref().r#type == CREATE_PREPARED_STATEMENT { + let any: prost_types::Any = Message::decode(&*request.get_ref().body) + .map_err(decode_error_to_status)?; let cmd: ActionCreatePreparedStatementRequest = any .unpack() @@ -628,15 +578,17 @@ where "Unable to unpack ActionCreatePreparedStatementRequest.", ) })?; - let stmt = self.do_action_create_prepared_statement(cmd).await?; + let stmt = self + .do_action_create_prepared_statement(cmd, request) + .await?; let output = futures::stream::iter(vec![Ok(super::super::gen::Result { body: stmt.as_any().encode_to_vec(), })]); return Ok(Response::new(Box::pin(output))); } - if request.r#type == CLOSE_PREPARED_STATEMENT { - let any: prost_types::Any = - prost::Message::decode(&*request.body).map_err(decode_error_to_status)?; + if request.get_ref().r#type == CLOSE_PREPARED_STATEMENT { + let any: prost_types::Any = Message::decode(&*request.get_ref().body) + .map_err(decode_error_to_status)?; let cmd: ActionClosePreparedStatementRequest = any .unpack() @@ -646,13 +598,13 @@ where "Unable to unpack ActionClosePreparedStatementRequest.", ) })?; - self.do_action_close_prepared_statement(cmd).await; + self.do_action_close_prepared_statement(cmd, request).await; return Ok(Response::new(Box::pin(futures::stream::empty()))); } Err(Status::invalid_argument(format!( "do_action: The defined request is invalid: {:?}", - request.r#type + request.get_ref().r#type ))) } diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 6139ff7702c5..19117ba5f03e 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "19.0.0" +version = "20.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "19.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "20.0.0", features = ["pyarrow"] } pyo3 = { version = "0.16", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 2bd0885b163d..dbc606ad20e3 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "19.0.0" +version = "20.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -37,8 +37,13 @@ name = "arrow" path = "src/lib.rs" bench = false +[target.'cfg(target_arch = "wasm32")'.dependencies] +ahash = { version = "0.8", default-features = false, features=["compile-time-rng"] } + +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +ahash = { 
version = "0.8", default-features = false, features=["runtime-rng"] } + [dependencies] -ahash = { version = "0.7", default-features = false } serde = { version = "1.0", default-features = false } serde_derive = { version = "1.0", default-features = false } serde_json = { version = "1.0", default-features = false, features = ["std"] } @@ -73,11 +78,13 @@ prettyprint = ["comfy-table"] # an optional dependency for supporting compile to wasm32-unknown-unknown # target without assuming an environment containing JavaScript. test_utils = ["rand"] -pyarrow = ["pyo3"] +pyarrow = ["pyo3", "ffi"] # force_validate runs full data validation for all arrays that are created # this is not enabled by default as it is too computationally expensive # but is run as part of our CI checks force_validate = [] +# Enable ffi support +ffi = [] [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } @@ -90,6 +97,7 @@ tempfile = { version = "3", default-features = false } [[example]] name = "dynamic_types" required-features = ["prettyprint"] +path="./examples/dynamic_types.rs" [[bench]] name = "aggregate_kernels" @@ -201,3 +209,7 @@ required-features = ["test_utils"] [[bench]] name = "array_data_validate" harness = false + +[[bench]] +name = "decimal_validate" +harness = false diff --git a/arrow/README.md b/arrow/README.md index d26a4f410c23..f7ccb9696455 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -22,7 +22,10 @@ [![crates.io](https://img.shields.io/crates/v/arrow.svg)](https://crates.io/crates/arrow) [![docs.rs](https://img.shields.io/docsrs/arrow.svg)](https://docs.rs/arrow/latest/arrow/) -This crate contains the official Native Rust implementation of [Apache Arrow][arrow] in memory format, governed by the Apache Software Foundation. Additional details can be found on [crates.io](https://crates.io/crates/arrow), [docs.rs](https://docs.rs/arrow/latest/arrow/) and [examples](https://github.com/apache/arrow-rs/tree/master/arrow/examples). +This crate contains the official Native Rust implementation of [Apache Arrow][arrow] in memory format, governed by the Apache Software Foundation. + +The [crate documentation](https://docs.rs/arrow/latest/arrow/) contains examples and full API. +There are several [examples](https://github.com/apache/arrow-rs/tree/master/arrow/examples) to start from as well. ## Rust Version Compatibility @@ -32,20 +35,26 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `19.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `20.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. 
-## Features +## Feature Flags -The arrow crate provides the following features which may be enabled: +The `arrow` crate provides the following features which may be enabled in your `Cargo.toml`: - `csv` (default) - support for reading and writing Arrow arrays to/from csv files - `ipc` (default) - support for the [arrow-flight](https://crates.io/crates/arrow-flight) IPC and wire format - `prettyprint` - support for formatting record batches as textual columns - `js` - support for building arrow for WebAssembly / JavaScript -- `simd` - (_Requires Nightly Rust_) alternate optimized +- `simd` - (_Requires Nightly Rust_) Use alternate hand optimized implementations of some [compute](https://github.com/apache/arrow-rs/tree/master/arrow/src/compute/kernels) - kernels using explicit SIMD instructions available through [packed_simd_2](https://docs.rs/packed_simd_2/latest/packed_simd_2/). + kernels using explicit SIMD instructions via [packed_simd_2](https://docs.rs/packed_simd_2/latest/packed_simd_2/). - `chrono-tz` - support of parsing timezone using [chrono-tz](https://docs.rs/chrono-tz/0.6.0/chrono_tz/) +- `ffi` - bindings for the Arrow C [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) +- `pyarrow` - bindings for pyo3 to call arrow-rs from python + +## Arrow Feature Status + +The [Apache Arrow Status](https://arrow.apache.org/docs/status.html) page lists which features of Arrow this crate supports. ## Safety @@ -55,25 +64,25 @@ Arrow seeks to uphold the Rust Soundness Pledge as articulated eloquently [here] Where soundness in turn is defined as: -> Code is unable to trigger undefined behaviour using safe APIs +> Code is unable to trigger undefined behavior using safe APIs -One way to ensure this would be to not use `unsafe`, however, as described in the opening chapter of the [Rustonomicon](https://doc.rust-lang.org/nomicon/meet-safe-and-unsafe.html) this is not a requirement, and flexibility in this regard is actually one of Rust's great strengths. +One way to ensure this would be to not use `unsafe`, however, as described in the opening chapter of the [Rustonomicon](https://doc.rust-lang.org/nomicon/meet-safe-and-unsafe.html) this is not a requirement, and flexibility in this regard is one of Rust's great strengths. In particular there are a number of scenarios where `unsafe` is largely unavoidable: -* Invariants that cannot be statically verified by the compiler and unlock non-trivial performance wins, e.g. values in a StringArray are UTF-8, [TrustedLen](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html) iterators, etc... -* FFI -* SIMD +- Invariants that cannot be statically verified by the compiler and unlock non-trivial performance wins, e.g. values in a StringArray are UTF-8, [TrustedLen](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html) iterators, etc... +- FFI +- SIMD -Additionally, this crate exposes a number of `unsafe` APIs, allowing downstream crates to explicitly opt-out of potentially expensive invariant checking where appropriate. +Additionally, this crate exposes a number of `unsafe` APIs, allowing downstream crates to explicitly opt-out of potentially expensive invariant checking where appropriate. 
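For context, here is a minimal, self-contained sketch of the safe construction path that these invariants protect, using the crate's standard `arrow::array` constructors (the concrete values are illustrative only and are not part of this change):

```rust
use arrow::array::{Array, Int32Array, StringArray};

fn main() {
    // Safe, strongly-typed construction: nulls and lengths are tracked for you,
    // and no `unsafe` is required.
    let ints = Int32Array::from(vec![Some(1), None, Some(3)]);
    assert_eq!(ints.len(), 3);
    assert!(ints.is_null(1));

    // A StringArray built this way upholds the UTF-8 invariant mentioned above.
    let strings = StringArray::from(vec!["hello", "arrow"]);
    assert_eq!(strings.value(1), "arrow");
}
```

The `unsafe` opt-outs discussed above exist for callers who have already established these invariants elsewhere and want to skip the redundant checks.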
We have a number of strategies to help reduce this risk: -* Provide strongly-typed `Array` and `ArrayBuilder` APIs to safely and efficiently interact with arrays -* Extensive validation logic to safely construct `ArrayData` from untrusted sources -* All commits are verified using [MIRI](https://github.com/rust-lang/miri) to detect undefined behaviour -* We provide a `force_validate` feature that enables additional validation checks for use in test/debug builds -* There is ongoing work to reduce and better document the use of unsafe, and we welcome contributions in this space +- Provide strongly-typed `Array` and `ArrayBuilder` APIs to safely and efficiently interact with arrays +- Extensive validation logic to safely construct `ArrayData` from untrusted sources +- All commits are verified using [MIRI](https://github.com/rust-lang/miri) to detect undefined behaviour +- Use a `force_validate` feature that enables additional validation checks for use in test/debug builds +- There is ongoing work to reduce and better document the use of unsafe, and we welcome contributions in this space ## Building for WASM @@ -101,16 +110,38 @@ cargo run --example read_csv [arrow]: https://arrow.apache.org/ +## Performance Tips -## Performance +Arrow aims to be as fast as possible out of the box, whilst not compromising on safety. However, +it relies heavily on LLVM auto-vectorisation to achieve this. Unfortunately the LLVM defaults, +particularly for x86_64, favour portability over performance, and LLVM will consequently avoid +using more recent instructions that would result in errors on older CPUs. -Most of the compute kernels benefit a lot from being optimized for a specific CPU target. -This is especially so on x86-64 since without specifying a target the compiler can only assume support for SSE2 vector instructions. -One of the following values as `-Ctarget-cpu=value` in `RUSTFLAGS` can therefore improve performance significantly: +To address this it is recommended that you override the LLVM defaults either +by setting the `RUSTFLAGS` environment variable, or by setting `rustflags` in your +[Cargo configuration](https://doc.rust-lang.org/cargo/reference/config.html) - - `native`: Target the exact features of the cpu that the build is running on. - This should give the best performance when building and running locally, but should be used carefully for example when building in a CI pipeline or when shipping pre-compiled software. - - `x86-64-v3`: Includes AVX2 support and is close to the intel `haswell` architecture released in 2013 and should be supported by any recent Intel or Amd cpu. - - `x86-64-v4`: Includes AVX512 support available on intel `skylake` server and `icelake`/`tigerlake`/`rocketlake` laptop and desktop processors. +Enable all features supported by the current CPU -These flags should be used in addition to the `simd` feature, since they will also affect the code generated by the simd library. 
\ No newline at end of file +```ignore +RUSTFLAGS="-C target-cpu=native" +``` + +Enable all features supported by the current CPU, and enable full use of AVX512 + +```ignore +RUSTFLAGS="-C target-cpu=native -C target-feature=-prefer-256-bit" +``` + +Enable all features supported by CPUs more recent than haswell (2013) + +```ignore +RUSTFLAGS="-C target-cpu=haswell" +``` + +For a full list of features and target CPUs use + +```shell +$ rustc --print target-cpus +$ rustc --print target-features +``` diff --git a/arrow/benches/decimal_validate.rs b/arrow/benches/decimal_validate.rs new file mode 100644 index 000000000000..1c726341e177 --- /dev/null +++ b/arrow/benches/decimal_validate.rs @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[macro_use] +extern crate criterion; + +use arrow::array::{Array, Decimal128Array, Decimal256Array, Decimal256Builder}; +use criterion::Criterion; + +extern crate arrow; + +use arrow::util::decimal::Decimal256; + +fn validate_decimal128_array(array: Decimal128Array) { + array.with_precision_and_scale(35, 0).unwrap(); +} + +fn validate_decimal256_array(array: Decimal256Array) { + array.with_precision_and_scale(35, 0).unwrap(); +} + +fn validate_decimal128_benchmark(c: &mut Criterion) { + let decimal_array = Decimal128Array::from_iter_values(vec![12324; 20000]); + let data = decimal_array.into_data(); + c.bench_function("validate_decimal128_array 20000", |b| { + b.iter(|| { + let array = Decimal128Array::from(data.clone()); + validate_decimal128_array(array); + }) + }); +} + +fn validate_decimal256_benchmark(c: &mut Criterion) { + let mut decimal_builder = Decimal256Builder::new(20000, 76, 0); + let mut bytes = vec![0; 32]; + bytes[0..16].clone_from_slice(&12324_i128.to_le_bytes()); + for _ in 0..20000 { + decimal_builder + .append_value(&Decimal256::new(76, 0, &bytes)) + .unwrap(); + } + let decimal_array256_data = decimal_builder.finish(); + let data = decimal_array256_data.into_data(); + c.bench_function("validate_decimal256_array 20000", |b| { + b.iter(|| { + let array = Decimal256Array::from(data.clone()); + validate_decimal256_array(array); + }) + }); +} + +criterion_group!( + benches, + validate_decimal128_benchmark, + validate_decimal256_benchmark, +); +criterion_main!(benches); diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index 422916996cde..9766f857c727 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -16,19 +16,16 @@ // under the License. 
use std::any::Any; -use std::convert::{From, TryFrom}; +use std::convert::From; use std::fmt; use std::sync::Arc; use super::*; -use crate::array::equal_json::JsonEqual; use crate::buffer::{Buffer, MutableBuffer}; -use crate::error::Result; -use crate::ffi; /// Trait for dealing with different types of array at runtime when the type of the /// array is not known in advance. -pub trait Array: fmt::Debug + Send + Sync + JsonEqual { +pub trait Array: fmt::Debug + Send + Sync { /// Returns the array as [`Any`](std::any::Any) so that it can be /// downcasted to a specific implementation. /// @@ -216,15 +213,6 @@ pub trait Array: fmt::Debug + Send + Sync + JsonEqual { self.data_ref().get_array_memory_size() + std::mem::size_of_val(self) - std::mem::size_of::() } - - /// returns two pointers that represent this array in the C Data Interface (FFI) - fn to_raw( - &self, - ) -> Result<(*const ffi::FFI_ArrowArray, *const ffi::FFI_ArrowSchema)> { - let data = self.data().clone(); - let array = ffi::ArrowArray::try_from(data)?; - Ok(ffi::ArrowArray::into_raw(array)) - } } /// A reference-counted reference to a generic `Array`. @@ -287,14 +275,6 @@ impl Array for ArrayRef { fn get_array_memory_size(&self) -> usize { self.as_ref().get_array_memory_size() } - - fn to_raw( - &self, - ) -> Result<(*const ffi::FFI_ArrowArray, *const ffi::FFI_ArrowSchema)> { - let data = self.data().clone(); - let array = ffi::ArrowArray::try_from(data)?; - Ok(ffi::ArrowArray::into_raw(array)) - } } impl<'a, T: Array> Array for &'a T { @@ -353,12 +333,6 @@ impl<'a, T: Array> Array for &'a T { fn get_array_memory_size(&self) -> usize { T::get_array_memory_size(self) } - - fn to_raw( - &self, - ) -> Result<(*const ffi::FFI_ArrowArray, *const ffi::FFI_ArrowSchema)> { - T::to_raw(self) - } } /// A generic trait for accessing the values of an [`Array`] @@ -733,42 +707,6 @@ fn new_null_sized_decimal( }) } -/// Creates a new array from two FFI pointers. Used to import arrays from the C Data Interface -/// # Safety -/// Assumes that these pointers represent valid C Data Interfaces, both in memory -/// representation and lifetime via the `release` mechanism. -pub unsafe fn make_array_from_raw( - array: *const ffi::FFI_ArrowArray, - schema: *const ffi::FFI_ArrowSchema, -) -> Result { - let array = ffi::ArrowArray::try_from_raw(array, schema)?; - let data = ArrayData::try_from(array)?; - Ok(make_array(data)) -} - -/// Exports an array to raw pointers of the C Data Interface provided by the consumer. -/// # Safety -/// Assumes that these pointers represent valid C Data Interfaces, both in memory -/// representation and lifetime via the `release` mechanism. -/// -/// This function copies the content of two FFI structs [ffi::FFI_ArrowArray] and -/// [ffi::FFI_ArrowSchema] in the array to the location pointed by the raw pointers. -/// Usually the raw pointers are provided by the array data consumer. -pub unsafe fn export_array_into_raw( - src: ArrayRef, - out_array: *mut ffi::FFI_ArrowArray, - out_schema: *mut ffi::FFI_ArrowSchema, -) -> Result<()> { - let data = src.data(); - let array = ffi::FFI_ArrowArray::new(data); - let schema = ffi::FFI_ArrowSchema::try_from(data.data_type())?; - - std::ptr::write_unaligned(out_array, array); - std::ptr::write_unaligned(out_schema, schema); - - Ok(()) -} - // Helper function for printing potentially long arrays. 
pub(super) fn print_long_array( array: &A, diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index dd21e0d51763..12c6978107d1 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -236,7 +236,7 @@ impl<'a, T: OffsetSizeTrait> GenericBinaryArray { impl fmt::Debug for GenericBinaryArray { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let prefix = if OffsetSize::IS_LARGE { "Large" } else { "" }; + let prefix = OffsetSize::PREFIX; write!(f, "{}BinaryArray\n[\n", prefix)?; print_long_array(self, f, |array, index, f| { @@ -608,11 +608,9 @@ mod tests { .unwrap(); let binary_array1 = GenericBinaryArray::::from(array_data1); - let data_type = if O::IS_LARGE { - DataType::LargeList - } else { - DataType::List - }(Box::new(Field::new("item", DataType::UInt8, false))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + Field::new("item", DataType::UInt8, false), + )); let array_data2 = ArrayData::builder(data_type) .len(3) @@ -660,11 +658,9 @@ mod tests { let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); let null_buffer = Buffer::from_slice_ref(&[0b101]); - let data_type = if O::IS_LARGE { - DataType::LargeList - } else { - DataType::List - }(Box::new(Field::new("item", DataType::UInt8, false))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + Field::new("item", DataType::UInt8, false), + )); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) @@ -707,11 +703,9 @@ mod tests { .unwrap(); let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap()); - let data_type = if O::IS_LARGE { - DataType::LargeList - } else { - DataType::List - }(Box::new(Field::new("item", DataType::UInt8, false))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + Field::new("item", DataType::UInt8, false), + )); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs index 9d7644befd6e..455383124c03 100644 --- a/arrow/src/array/array_decimal.rs +++ b/arrow/src/array/array_decimal.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{ArrayAccessor, Decimal128Iter, Decimal256Iter}; +use crate::array::ArrayAccessor; use num::BigInt; use std::borrow::Borrow; use std::convert::From; @@ -25,17 +25,16 @@ use std::{any::Any, iter::FromIterator}; use super::{ array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, FixedSizeListArray, }; -use super::{BooleanBufferBuilder, FixedSizeBinaryArray}; +use super::{BasicDecimalIter, BooleanBufferBuilder, FixedSizeBinaryArray}; #[allow(deprecated)] pub use crate::array::DecimalIter; use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::{validate_decimal256_precision_with_lt_bytes, DataType}; use crate::datatypes::{ - validate_decimal256_precision, validate_decimal_precision, DECIMAL256_MAX_PRECISION, - DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, + validate_decimal_precision, DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE, }; -use crate::datatypes::{DataType, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE}; use crate::error::{ArrowError, Result}; -use crate::util::decimal::{BasicDecimal, Decimal128, Decimal256}; +use crate::util::decimal::{BasicDecimal, Decimal256}; /// `Decimal128Array` stores fixed width decimal numbers, /// with a fixed precision and scale. 
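The test changes above swap the `if O::IS_LARGE { DataType::LargeList } else { DataType::List }` construction for `GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR`, a constant introduced later in this patch in `array_list.rs`. A minimal sketch of what that constant produces (the `main` wrapper and the `item` closure are illustrative):

```rust
use arrow::array::GenericListArray;
use arrow::datatypes::{DataType, Field};

fn main() {
    let item = || Box::new(Field::new("item", DataType::UInt8, false));

    // i32 offsets select DataType::List, i64 offsets select DataType::LargeList.
    let small = GenericListArray::<i32>::DATA_TYPE_CONSTRUCTOR(item());
    let large = GenericListArray::<i64>::DATA_TYPE_CONSTRUCTOR(item());

    assert_eq!(small, DataType::List(item()));
    assert_eq!(large, DataType::LargeList(item()));
}
```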
@@ -71,44 +70,41 @@ use crate::util::decimal::{BasicDecimal, Decimal128, Decimal256}; /// assert_eq!(6, decimal_array.scale()); /// ``` /// -pub struct Decimal128Array { - data: ArrayData, - value_data: RawPtrBox, - precision: usize, - scale: usize, -} +pub type Decimal128Array = BasicDecimalArray<16>; + +pub type Decimal256Array = BasicDecimalArray<32>; -pub struct Decimal256Array { +pub struct BasicDecimalArray { data: ArrayData, value_data: RawPtrBox, precision: usize, scale: usize, } -mod private_decimal { - pub trait DecimalArrayPrivate { - fn raw_value_data_ptr(&self) -> *const u8; - } -} - -pub trait BasicDecimalArray>: - private_decimal::DecimalArrayPrivate -{ - const VALUE_LENGTH: i32; - const DEFAULT_TYPE: DataType; - const MAX_PRECISION: usize; - const MAX_SCALE: usize; +impl BasicDecimalArray { + pub const VALUE_LENGTH: i32 = BYTE_WIDTH as i32; + const DEFAULT_TYPE: DataType = BasicDecimal::::DEFAULT_TYPE; + pub const MAX_PRECISION: usize = BasicDecimal::::MAX_PRECISION; + pub const MAX_SCALE: usize = BasicDecimal::::MAX_SCALE; + const TYPE_CONSTRUCTOR: fn(usize, usize) -> DataType = + BasicDecimal::::TYPE_CONSTRUCTOR; - fn data(&self) -> &ArrayData; + pub fn data(&self) -> &ArrayData { + &self.data + } /// Return the precision (total digits) that can be stored by this array - fn precision(&self) -> usize; + pub fn precision(&self) -> usize { + self.precision + } /// Return the scale (digits after the decimal) that can be stored by this array - fn scale(&self) -> usize; + pub fn scale(&self) -> usize { + self.scale + } /// Returns the element at index `i`. - fn value(&self, i: usize) -> T { + pub fn value(&self, i: usize) -> BasicDecimal { assert!(i < self.data().len(), "Out of bounds access"); unsafe { self.value_unchecked(i) } @@ -117,7 +113,7 @@ pub trait BasicDecimalArray>: /// Returns the element at index `i`. /// # Safety /// Caller is responsible for ensuring that the index is within the bounds of the array - unsafe fn value_unchecked(&self, i: usize) -> T { + pub unsafe fn value_unchecked(&self, i: usize) -> BasicDecimal { let data = self.data(); let offset = i + data.offset(); let raw_val = { @@ -127,14 +123,14 @@ pub trait BasicDecimalArray>: Self::VALUE_LENGTH as usize, ) }; - T::new(self.precision(), self.scale(), raw_val) + BasicDecimal::::new(self.precision(), self.scale(), raw_val) } /// Returns the offset for the element at index `i`. /// /// Note this doesn't do any bound checking, for performance reason. #[inline] - fn value_offset(&self, i: usize) -> i32 { + pub fn value_offset(&self, i: usize) -> i32 { self.value_offset_at(self.data().offset() + i) } @@ -142,22 +138,22 @@ pub trait BasicDecimalArray>: /// /// All elements have the same length as the array is a fixed size. 
#[inline] - fn value_length(&self) -> i32 { + pub fn value_length(&self) -> i32 { Self::VALUE_LENGTH } /// Returns a clone of the value data buffer - fn value_data(&self) -> Buffer { + pub fn value_data(&self) -> Buffer { self.data().buffers()[0].clone() } #[inline] - fn value_offset_at(&self, i: usize) -> i32 { + pub fn value_offset_at(&self, i: usize) -> i32 { Self::VALUE_LENGTH * i as i32 } #[inline] - fn value_as_string(&self, row: usize) -> String { + pub fn value_as_string(&self, row: usize) -> String { self.value(row).to_string() } @@ -165,11 +161,11 @@ pub trait BasicDecimalArray>: /// /// NB: This function does not validate that each value is in the permissible /// range for a decimal - fn from_fixed_size_binary_array( + pub fn from_fixed_size_binary_array( v: FixedSizeBinaryArray, precision: usize, scale: usize, - ) -> U { + ) -> Self { assert!( v.value_length() == Self::VALUE_LENGTH, "Value length of the array ({}) must equal to the byte width of the decimal ({})", @@ -184,7 +180,7 @@ pub trait BasicDecimalArray>: let builder = v.into_data().into_builder().data_type(data_type); let array_data = unsafe { builder.build_unchecked() }; - U::from(array_data) + Self::from(array_data) } /// Build a decimal array from [`FixedSizeListArray`]. @@ -192,11 +188,11 @@ pub trait BasicDecimalArray>: /// NB: This function does not validate that each value is in the permissible /// range for a decimal. #[deprecated(note = "please use `from_fixed_size_binary_array` instead")] - fn from_fixed_size_list_array( + pub fn from_fixed_size_list_array( v: FixedSizeListArray, precision: usize, scale: usize, - ) -> U { + ) -> Self { assert_eq!( v.data_ref().child_data().len(), 1, @@ -242,14 +238,47 @@ pub trait BasicDecimalArray>: .offset(list_offset); let array_data = unsafe { builder.build_unchecked() }; - U::from(array_data) + Self::from(array_data) } /// The default precision and scale used when not specified. - fn default_type() -> DataType { + pub const fn default_type() -> DataType { Self::DEFAULT_TYPE } + fn raw_value_data_ptr(&self) -> *const u8 { + self.value_data.as_ptr() + } +} + +impl Decimal128Array { + /// Creates a [Decimal128Array] with default precision and scale, + /// based on an iterator of `i128` values without nulls + pub fn from_iter_values>(iter: I) -> Self { + let val_buf: Buffer = iter.into_iter().collect(); + let data = unsafe { + ArrayData::new_unchecked( + Self::default_type(), + val_buf.len() / std::mem::size_of::(), + None, + None, + 0, + vec![val_buf], + vec![], + ) + }; + Decimal128Array::from(data) + } + + // Validates decimal values in this array can be properly interpreted + // with the specified precision. + fn validate_decimal_precision(&self, precision: usize) -> Result<()> { + for v in self.iter().flatten() { + validate_decimal_precision(v.as_i128(), precision)?; + } + Ok(()) + } + /// Returns a Decimal array with the same data as self, with the /// specified precision. /// @@ -257,7 +286,7 @@ pub trait BasicDecimalArray>: /// 1. `precision` is larger than [`Self::MAX_PRECISION`] /// 2. `scale` is larger than [`Self::MAX_SCALE`]; /// 3. `scale` is > `precision` - fn with_precision_and_scale(self, precision: usize, scale: usize) -> Result + pub fn with_precision_and_scale(self, precision: usize, scale: usize) -> Result where Self: Sized, { @@ -285,140 +314,101 @@ pub trait BasicDecimalArray>: // Ensure that all values are within the requested // precision. 
For performance, only check if the precision is // decreased - self.validate_decimal_precision(precision)?; + if precision < self.precision { + self.validate_decimal_precision(precision)?; + } - let data_type = if Self::VALUE_LENGTH == 16 { - DataType::Decimal128(self.precision(), self.scale()) - } else { - DataType::Decimal256(self.precision(), self.scale()) - }; + let data_type = Self::TYPE_CONSTRUCTOR(self.precision, self.scale); assert_eq!(self.data().data_type(), &data_type); // safety: self.data is valid DataType::Decimal as checked above - let new_data_type = if Self::VALUE_LENGTH == 16 { - DataType::Decimal128(precision, scale) - } else { - DataType::Decimal256(precision, scale) - }; + let new_data_type = Self::TYPE_CONSTRUCTOR(precision, scale); Ok(self.data().clone().with_data_type(new_data_type).into()) } - - /// Validates decimal values in this array can be properly interpreted - /// with the specified precision. - fn validate_decimal_precision(&self, precision: usize) -> Result<()>; } -impl BasicDecimalArray for Decimal128Array { - const VALUE_LENGTH: i32 = 16; - const DEFAULT_TYPE: DataType = - DataType::Decimal128(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); - const MAX_PRECISION: usize = DECIMAL128_MAX_PRECISION; - const MAX_SCALE: usize = DECIMAL128_MAX_SCALE; - - fn data(&self) -> &ArrayData { - &self.data - } - - fn precision(&self) -> usize { - self.precision - } - - fn scale(&self) -> usize { - self.scale - } - +impl Decimal256Array { + // Validates decimal values in this array can be properly interpreted + // with the specified precision. fn validate_decimal_precision(&self, precision: usize) -> Result<()> { - if precision < self.precision { - for v in self.iter().flatten() { - validate_decimal_precision(v.as_i128(), precision)?; + (0..self.len()).try_for_each(|idx| { + if self.is_valid(idx) { + let raw_val = unsafe { + let pos = self.value_offset(idx); + std::slice::from_raw_parts( + self.raw_value_data_ptr().offset(pos as isize), + Self::VALUE_LENGTH as usize, + ) + }; + validate_decimal256_precision_with_lt_bytes(raw_val, precision) + } else { + Ok(()) } - } - Ok(()) + }) } -} - -impl BasicDecimalArray for Decimal256Array { - const VALUE_LENGTH: i32 = 32; - const DEFAULT_TYPE: DataType = - DataType::Decimal256(DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); - const MAX_PRECISION: usize = DECIMAL256_MAX_PRECISION; - const MAX_SCALE: usize = DECIMAL256_MAX_SCALE; - fn data(&self) -> &ArrayData { - &self.data - } - - fn precision(&self) -> usize { - self.precision - } - - fn scale(&self) -> usize { - self.scale - } + /// Returns a Decimal array with the same data as self, with the + /// specified precision. + /// + /// Returns an Error if: + /// 1. `precision` is larger than [`Self::MAX_PRECISION`] + /// 2. `scale` is larger than [`Self::MAX_SCALE`]; + /// 3. 
`scale` is > `precision` + pub fn with_precision_and_scale(self, precision: usize, scale: usize) -> Result + where + Self: Sized, + { + if precision > Self::MAX_PRECISION { + return Err(ArrowError::InvalidArgumentError(format!( + "precision {} is greater than max {}", + precision, + Self::MAX_PRECISION + ))); + } + if scale > Self::MAX_SCALE { + return Err(ArrowError::InvalidArgumentError(format!( + "scale {} is greater than max {}", + scale, + Self::MAX_SCALE + ))); + } + if scale > precision { + return Err(ArrowError::InvalidArgumentError(format!( + "scale {} is greater than precision {}", + scale, precision + ))); + } - fn validate_decimal_precision(&self, precision: usize) -> Result<()> { + // Ensure that all values are within the requested + // precision. For performance, only check if the precision is + // decreased if precision < self.precision { - for v in self.iter().flatten() { - validate_decimal256_precision(&v.to_string(), precision)?; - } + self.validate_decimal_precision(precision)?; } - Ok(()) - } -} -impl Decimal128Array { - /// Creates a [Decimal128Array] with default precision and scale, - /// based on an iterator of `i128` values without nulls - pub fn from_iter_values>(iter: I) -> Self { - let val_buf: Buffer = iter.into_iter().collect(); - let data = unsafe { - ArrayData::new_unchecked( - Self::default_type(), - val_buf.len() / std::mem::size_of::(), - None, - None, - 0, - vec![val_buf], - vec![], - ) - }; - Decimal128Array::from(data) - } -} + let data_type = Self::TYPE_CONSTRUCTOR(self.precision, self.scale); + assert_eq!(self.data().data_type(), &data_type); -impl From for Decimal128Array { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 1, - "Decimal128Array data should contain 1 buffer only (values)" - ); - let values = data.buffers()[0].as_ptr(); - let (precision, scale) = match data.data_type() { - DataType::Decimal128(precision, scale) => (*precision, *scale), - _ => panic!("Expected data type to be Decimal"), - }; - Self { - data, - value_data: unsafe { RawPtrBox::new(values) }, - precision, - scale, - } + // safety: self.data is valid DataType::Decimal as checked above + let new_data_type = Self::TYPE_CONSTRUCTOR(precision, scale); + + Ok(self.data().clone().with_data_type(new_data_type).into()) } } -impl From for Decimal256Array { +impl From for BasicDecimalArray { fn from(data: ArrayData) -> Self { assert_eq!( data.buffers().len(), 1, - "Decimal256Array data should contain 1 buffer only (values)" + "DecimalArray data should contain 1 buffer only (values)" ); let values = data.buffers()[0].as_ptr(); - let (precision, scale) = match data.data_type() { - DataType::Decimal256(precision, scale) => (*precision, *scale), - _ => panic!("Expected data type to be Decimal256"), + let (precision, scale) = match (data.data_type(), BYTE_WIDTH) { + (DataType::Decimal128(precision, scale), 16) + | (DataType::Decimal256(precision, scale), 32) => (*precision, *scale), + _ => panic!("Expected data type to be Decimal"), }; Self { data, @@ -446,17 +436,13 @@ impl From for Decimal256 { } } -fn build_decimal_array_from, T>( +fn build_decimal_array_from( null_buf: BooleanBufferBuilder, buffer: Buffer, -) -> U -where - T: BasicDecimal, - U: From, -{ +) -> BasicDecimalArray { let data = unsafe { ArrayData::new_unchecked( - U::default_type(), + BasicDecimalArray::::default_type(), null_buf.len(), None, Some(null_buf.into()), @@ -465,7 +451,7 @@ where vec![], ) }; - U::from(data) + BasicDecimalArray::::from(data) } impl> FromIterator> for Decimal256Array { 
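The rewritten `with_precision_and_scale` keeps the documented failure modes (precision above the maximum, scale above the maximum, scale greater than precision, or stored values that no longer fit) and only re-validates values when the precision is narrowed. A minimal sketch of those cases for `Decimal128Array` (the values and the `main` wrapper are illustrative):

```rust
use arrow::array::Decimal128Array;

fn main() {
    // Values that need 6 decimal digits of precision.
    let make = || Decimal128Array::from_iter_values([123456_i128, -7890]);

    // Precision above the 128-bit maximum (38) is rejected.
    assert!(make().with_precision_and_scale(39, 0).is_err());
    // A scale greater than the precision is rejected.
    assert!(make().with_precision_and_scale(5, 6).is_err());
    // Narrowing the precision re-validates values: 123456 does not fit in 3 digits.
    assert!(make().with_precision_and_scale(3, 0).is_err());

    // A compatible precision/scale succeeds and only rewrites the DataType.
    let narrowed = make().with_precision_and_scale(10, 2).unwrap();
    assert_eq!((narrowed.precision(), narrowed.scale()), (10, 2));
}
```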
@@ -488,7 +474,7 @@ impl> FromIterator> for Decimal256Array { } }); - build_decimal_array_from::(null_buf, buffer.into()) + build_decimal_array_from::<32>(null_buf, buffer.into()) } } @@ -513,96 +499,75 @@ impl>> FromIterator for Decimal128Array { }) .collect(); - build_decimal_array_from::(null_buf, buffer) + build_decimal_array_from::<16>(null_buf, buffer) } } -macro_rules! def_decimal_array { - ($ty:ident, $array_name:expr, $decimal_ty:ident, $iter_ty:ident) => { - impl private_decimal::DecimalArrayPrivate for $ty { - fn raw_value_data_ptr(&self) -> *const u8 { - self.value_data.as_ptr() - } - } - - impl Array for $ty { - fn as_any(&self) -> &dyn Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } +impl Array for BasicDecimalArray { + fn as_any(&self) -> &dyn Any { + self + } - fn into_data(self) -> ArrayData { - self.into() - } - } + fn data(&self) -> &ArrayData { + &self.data + } - impl From<$ty> for ArrayData { - fn from(array: $ty) -> Self { - array.data - } - } + fn into_data(self) -> ArrayData { + self.into() + } +} - impl fmt::Debug for $ty { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "{}<{}, {}>\n[\n", - $array_name, self.precision, self.scale - )?; - print_long_array(self, f, |array, index, f| { - let formatted_decimal = array.value_as_string(index); - - write!(f, "{}", formatted_decimal) - })?; - write!(f, "]") - } - } +impl From> for ArrayData { + fn from(array: BasicDecimalArray) -> Self { + array.data + } +} - impl<'a> ArrayAccessor for &'a $ty { - type Item = $decimal_ty; +impl fmt::Debug for BasicDecimalArray { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "Decimal{}Array<{}, {}>\n[\n", + BYTE_WIDTH * 8, + self.precision, + self.scale + )?; + print_long_array(self, f, |array, index, f| { + let formatted_decimal = array.value_as_string(index); + + write!(f, "{}", formatted_decimal) + })?; + write!(f, "]") + } +} - fn value(&self, index: usize) -> Self::Item { - $ty::value(self, index) - } +impl<'a, const BYTE_WIDTH: usize> ArrayAccessor for &'a BasicDecimalArray { + type Item = BasicDecimal; - unsafe fn value_unchecked(&self, index: usize) -> Self::Item { - $ty::value_unchecked(self, index) - } - } + fn value(&self, index: usize) -> Self::Item { + BasicDecimalArray::::value(self, index) + } - impl<'a> IntoIterator for &'a $ty { - type Item = Option<$decimal_ty>; - type IntoIter = $iter_ty<'a>; + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + BasicDecimalArray::::value_unchecked(self, index) + } +} - fn into_iter(self) -> Self::IntoIter { - $iter_ty::<'a>::new(self) - } - } +impl<'a, const BYTE_WIDTH: usize> IntoIterator for &'a BasicDecimalArray { + type Item = Option>; + type IntoIter = BasicDecimalIter<'a, BYTE_WIDTH>; - impl<'a> $ty { - /// constructs a new iterator - pub fn iter(&'a self) -> $iter_ty<'a> { - $iter_ty::<'a>::new(self) - } - } - }; + fn into_iter(self) -> Self::IntoIter { + BasicDecimalIter::<'a, BYTE_WIDTH>::new(self) + } } -def_decimal_array!( - Decimal128Array, - "Decimal128Array", - Decimal128, - Decimal128Iter -); -def_decimal_array!( - Decimal256Array, - "Decimal256Array", - Decimal256, - Decimal256Iter -); +impl<'a, const BYTE_WIDTH: usize> BasicDecimalArray { + /// constructs a new iterator + pub fn iter(&'a self) -> BasicDecimalIter<'a, BYTE_WIDTH> { + BasicDecimalIter::<'a, BYTE_WIDTH>::new(self) + } +} #[cfg(test)] mod tests { diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs index 4f7d5f9c147b..2acb51750d17 100644 
--- a/arrow/src/array/array_dictionary.rs +++ b/arrow/src/array/array_dictionary.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::array::{ArrayAccessor, ArrayIter}; use std::any::Any; use std::fmt; use std::iter::IntoIterator; @@ -234,6 +235,28 @@ impl DictionaryArray { .expect("Dictionary index not usize") }) } + + /// Downcast this dictionary to a [`TypedDictionaryArray`] + /// + /// ``` + /// use arrow::array::{Array, ArrayAccessor, DictionaryArray, StringArray}; + /// use arrow::datatypes::Int32Type; + /// + /// let orig = [Some("a"), Some("b"), None]; + /// let dictionary = DictionaryArray::::from_iter(orig); + /// let typed = dictionary.downcast_dict::().unwrap(); + /// assert_eq!(typed.value(0), "a"); + /// assert_eq!(typed.value(1), "b"); + /// assert!(typed.is_null(2)); + /// ``` + /// + pub fn downcast_dict(&self) -> Option> { + let values = self.values.as_any().downcast_ref()?; + Some(TypedDictionaryArray { + dictionary: self, + values, + }) + } } /// Constructs a `DictionaryArray` from an array data reference. @@ -302,9 +325,7 @@ impl From> for ArrayData { /// format!("{:?}", array) /// ); /// ``` -impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator> - for DictionaryArray -{ +impl<'a, T: ArrowDictionaryKeyType> FromIterator> for DictionaryArray { fn from_iter>>(iter: I) -> Self { let it = iter.into_iter(); let (lower, _) = it.size_hint(); @@ -342,9 +363,7 @@ impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator FromIterator<&'a str> - for DictionaryArray -{ +impl<'a, T: ArrowDictionaryKeyType> FromIterator<&'a str> for DictionaryArray { fn from_iter>(iter: I) -> Self { let it = iter.into_iter(); let (lower, _) = it.size_hint(); @@ -385,6 +404,111 @@ impl fmt::Debug for DictionaryArray { } } +/// A strongly-typed wrapper around a [`DictionaryArray`] that implements [`ArrayAccessor`] +/// allowing fast access to its elements +/// +/// ``` +/// use arrow::array::{ArrayIter, DictionaryArray, StringArray}; +/// use arrow::datatypes::Int32Type; +/// +/// let orig = ["a", "b", "a", "b"]; +/// let dictionary = DictionaryArray::::from_iter(orig); +/// +/// // `TypedDictionaryArray` allows you to access the values directly +/// let typed = dictionary.downcast_dict::().unwrap(); +/// +/// for (maybe_val, orig) in typed.into_iter().zip(orig) { +/// assert_eq!(maybe_val.unwrap(), orig) +/// } +/// ``` +pub struct TypedDictionaryArray<'a, K: ArrowPrimitiveType, V> { + /// The dictionary array + dictionary: &'a DictionaryArray, + /// The values of the dictionary + values: &'a V, +} + +// Manually implement `Clone` to avoid `V: Clone` type constraint +impl<'a, K: ArrowPrimitiveType, V> Clone for TypedDictionaryArray<'a, K, V> { + fn clone(&self) -> Self { + Self { + dictionary: self.dictionary, + values: self.values, + } + } +} + +impl<'a, K: ArrowPrimitiveType, V> Copy for TypedDictionaryArray<'a, K, V> {} + +impl<'a, K: ArrowPrimitiveType, V> fmt::Debug for TypedDictionaryArray<'a, K, V> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "TypedDictionaryArray({:?})", self.dictionary) + } +} + +impl<'a, K: ArrowPrimitiveType, V> TypedDictionaryArray<'a, K, V> { + /// Returns the keys of this [`TypedDictionaryArray`] + pub fn keys(&self) -> &'a PrimitiveArray { + self.dictionary.keys() + } + + /// Returns the values of this [`TypedDictionaryArray`] + pub fn values(&self) -> &'a V { + self.values + } +} + +impl<'a, K: ArrowPrimitiveType, V: Sync> Array for 
TypedDictionaryArray<'a, K, V> { + fn as_any(&self) -> &dyn Any { + self.dictionary + } + + fn data(&self) -> &ArrayData { + &self.dictionary.data + } + + fn into_data(self) -> ArrayData { + self.dictionary.into_data() + } +} + +impl<'a, K, V> IntoIterator for TypedDictionaryArray<'a, K, V> +where + K: ArrowPrimitiveType, + V: Sync + Send, + &'a V: ArrayAccessor, +{ + type Item = Option<::Item>; + type IntoIter = ArrayIter; + + fn into_iter(self) -> Self::IntoIter { + ArrayIter::new(self) + } +} + +impl<'a, K, V> ArrayAccessor for TypedDictionaryArray<'a, K, V> +where + K: ArrowPrimitiveType, + V: Sync + Send, + &'a V: ArrayAccessor, +{ + type Item = <&'a V as ArrayAccessor>::Item; + + fn value(&self, index: usize) -> Self::Item { + assert!(self.dictionary.is_valid(index), "{}", index); + let value_idx = self.dictionary.keys.value(index).to_usize().unwrap(); + // Dictionary indexes should be valid + unsafe { self.values.value_unchecked(value_idx) } + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + let val = self.dictionary.keys.value_unchecked(index); + let value_idx = val.to_usize().unwrap(); + // Dictionary indexes should be valid + self.values.value_unchecked(value_idx) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs index 543cd1fec810..b9c05014c3f7 100644 --- a/arrow/src/array/array_list.rs +++ b/arrow/src/array/array_list.rs @@ -34,14 +34,17 @@ use crate::{ /// trait declaring an offset size, relevant for i32 vs i64 array types. pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer { const IS_LARGE: bool; + const PREFIX: &'static str; } impl OffsetSizeTrait for i32 { const IS_LARGE: bool = false; + const PREFIX: &'static str = ""; } impl OffsetSizeTrait for i64 { const IS_LARGE: bool = true; + const PREFIX: &'static str = "Large"; } /// Generic struct for a variable-size list array. @@ -57,6 +60,16 @@ pub struct GenericListArray { } impl GenericListArray { + /// The data type constructor of list array. + /// The input is the schema of the child array and + /// the output is the [`DataType`], List or LargeList. + pub const DATA_TYPE_CONSTRUCTOR: fn(Box) -> DataType = if OffsetSize::IS_LARGE + { + DataType::LargeList + } else { + DataType::List + }; + /// Returns a reference to the values of this list. 
pub fn values(&self) -> ArrayRef { self.values.clone() @@ -170,11 +183,7 @@ impl GenericListArray { .collect(); let field = Box::new(Field::new("item", T::DATA_TYPE, true)); - let data_type = if OffsetSize::IS_LARGE { - DataType::LargeList(field) - } else { - DataType::List(field) - }; + let data_type = Self::DATA_TYPE_CONSTRUCTOR(field); let array_data = ArrayData::builder(data_type) .len(null_buf.len()) .add_buffer(offsets.into()) @@ -274,7 +283,7 @@ impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListArray fmt::Debug for GenericListArray { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let prefix = if OffsetSize::IS_LARGE { "Large" } else { "" }; + let prefix = OffsetSize::PREFIX; write!(f, "{}ListArray\n[\n", prefix)?; print_long_array(self, f, |array, index, f| { diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index eb731a2b2f1e..a10104d980e1 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -549,6 +549,18 @@ impl PrimitiveArray { let array_data = unsafe { array_data.build_unchecked() }; PrimitiveArray::from(array_data) } + + /// Construct a timestamp array with new timezone + pub fn with_timezone(&self, timezone: String) -> Self { + let array_data = unsafe { + self.data + .clone() + .into_builder() + .data_type(DataType::Timestamp(T::get_time_unit(), Some(timezone))) + .build_unchecked() + }; + PrimitiveArray::from(array_data) + } } impl PrimitiveArray { @@ -1099,4 +1111,21 @@ mod tests { BooleanArray::from(vec![true, true, true, true, true]) ); } + + #[cfg(feature = "chrono-tz")] + #[test] + fn test_with_timezone() { + use crate::compute::hour; + let a: TimestampMicrosecondArray = vec![37800000000, 86339000000].into(); + + let b = hour(&a).unwrap(); + assert_eq!(10, b.value(0)); + assert_eq!(23, b.value(1)); + + let a = a.with_timezone(String::from("America/Los_Angeles")); + + let b = hour(&a).unwrap(); + assert_eq!(2, b.value(0)); + assert_eq!(15, b.value(1)); + } } diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 1bb99fce7eda..b72152cc4acd 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -294,7 +294,7 @@ impl<'a, T: OffsetSizeTrait> GenericStringArray { impl fmt::Debug for GenericStringArray { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let prefix = if OffsetSize::IS_LARGE { "Large" } else { "" }; + let prefix = OffsetSize::PREFIX; write!(f, "{}StringArray\n[\n", prefix)?; print_long_array(self, f, |array, index, f| { @@ -707,11 +707,9 @@ mod tests { let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); let null_buffer = Buffer::from_slice_ref(&[0b101]); - let data_type = if O::IS_LARGE { - DataType::LargeList - } else { - DataType::List - }(Box::new(Field::new("item", DataType::UInt8, false))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + Field::new("item", DataType::UInt8, false), + )); // [None, Some("Parquet")] let array_data = ArrayData::builder(data_type) @@ -754,11 +752,9 @@ mod tests { .unwrap(); let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap()); - let data_type = if O::IS_LARGE { - DataType::LargeList - } else { - DataType::List - }(Box::new(Field::new("item", DataType::UInt8, false))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + Field::new("item", DataType::UInt8, false), + )); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) @@ -792,11 +788,9 @@ mod tests { .unwrap(); let offsets 
= [0, 2, 3].map(|n| O::from_usize(n).unwrap()); - let data_type = if O::IS_LARGE { - DataType::LargeList - } else { - DataType::List - }(Box::new(Field::new("item", DataType::UInt16, false))); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + Field::new("item", DataType::UInt16, false), + )); let array_data = ArrayData::builder(data_type) .len(2) diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow/src/array/builder/decimal_builder.rs index 22c1490e86f3..5527679c3b7d 100644 --- a/arrow/src/array/builder/decimal_builder.rs +++ b/arrow/src/array/builder/decimal_builder.rs @@ -15,19 +15,20 @@ // specific language governing permissions and limitations // under the License. -use num::BigInt; use std::any::Any; use std::sync::Arc; -use crate::array::array_decimal::{BasicDecimalArray, Decimal256Array}; +use crate::array::array_decimal::Decimal256Array; use crate::array::ArrayRef; use crate::array::Decimal128Array; use crate::array::{ArrayBuilder, FixedSizeBinaryBuilder}; use crate::error::{ArrowError, Result}; -use crate::datatypes::{validate_decimal256_precision, validate_decimal_precision}; -use crate::util::decimal::{BasicDecimal, Decimal256}; +use crate::datatypes::{ + validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, +}; +use crate::util::decimal::Decimal256; /// Array Builder for [`Decimal128Array`] /// @@ -201,9 +202,7 @@ impl Decimal256Builder { pub fn append_value(&mut self, value: &Decimal256) -> Result<()> { let value = if self.value_validation { let raw_bytes = value.raw_value(); - let integer = BigInt::from_signed_bytes_le(raw_bytes); - let value_str = integer.to_string(); - validate_decimal256_precision(&value_str, self.precision)?; + validate_decimal256_precision_with_lt_bytes(raw_bytes, self.precision)?; value } else { value @@ -256,9 +255,9 @@ impl Decimal256Builder { #[cfg(test)] mod tests { use super::*; - use num::Num; + use num::{BigInt, Num}; - use crate::array::array_decimal::{BasicDecimalArray, Decimal128Array}; + use crate::array::array_decimal::Decimal128Array; use crate::array::{array_decimal, Array}; use crate::datatypes::DataType; use crate::util::decimal::{Decimal128, Decimal256}; @@ -305,21 +304,21 @@ mod tests { fn test_decimal256_builder() { let mut builder = Decimal256Builder::new(30, 40, 6); - let mut bytes = vec![0; 32]; + let mut bytes = [0_u8; 32]; bytes[0..16].clone_from_slice(&8_887_000_000_i128.to_le_bytes()); - let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); builder.append_value(&value).unwrap(); builder.append_null(); - bytes = vec![255; 32]; - let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + bytes = [255; 32]; + let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); builder.append_value(&value).unwrap(); - bytes = vec![0; 32]; + bytes = [0; 32]; bytes[0..16].clone_from_slice(&0_i128.to_le_bytes()); bytes[15] = 128; - let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); builder.append_value(&value).unwrap(); builder.append_option(None::<&Decimal256>).unwrap(); @@ -349,9 +348,9 @@ mod tests { fn test_decimal256_builder_unmatched_precision_scale() { let mut builder = Decimal256Builder::new(30, 10, 6); - let mut bytes = vec![0; 32]; + let mut bytes = [0_u8; 32]; bytes[0..16].clone_from_slice(&8_887_000_000_i128.to_le_bytes()); - let value = 
Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); builder.append_value(&value).unwrap(); } diff --git a/arrow/src/array/builder/generic_list_builder.rs b/arrow/src/array/builder/generic_list_builder.rs index 911182f6571d..686156df13bc 100644 --- a/arrow/src/array/builder/generic_list_builder.rs +++ b/arrow/src/array/builder/generic_list_builder.rs @@ -22,7 +22,6 @@ use crate::array::ArrayData; use crate::array::ArrayRef; use crate::array::GenericListArray; use crate::array::OffsetSizeTrait; -use crate::datatypes::DataType; use crate::datatypes::Field; use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder}; @@ -135,11 +134,7 @@ where values_data.data_type().clone(), true, // TODO: find a consistent way of getting this )); - let data_type = if OffsetSize::IS_LARGE { - DataType::LargeList(field) - } else { - DataType::List(field) - }; + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(field); let array_data_builder = ArrayData::builder(data_type) .len(len) .add_buffer(offset_buffer) @@ -163,6 +158,7 @@ mod tests { use crate::array::builder::ListBuilder; use crate::array::{Array, Int32Array, Int32Builder}; use crate::buffer::Buffer; + use crate::datatypes::DataType; fn _test_generic_list_array_builder() { let values_builder = Int32Builder::new(10); diff --git a/arrow/src/array/builder/string_dictionary_builder.rs b/arrow/src/array/builder/string_dictionary_builder.rs index cfbda38c0b28..15a36a64c14e 100644 --- a/arrow/src/array/builder/string_dictionary_builder.rs +++ b/arrow/src/array/builder/string_dictionary_builder.rs @@ -137,7 +137,7 @@ where for (idx, maybe_value) in dictionary_values.iter().enumerate() { match maybe_value { Some(value) => { - let hash = compute_hash(&state, value.as_bytes()); + let hash = state.hash_one(value.as_bytes()); let key = K::Native::from_usize(idx) .ok_or(ArrowError::DictionaryKeyOverflowError)?; @@ -149,7 +149,7 @@ where if let RawEntryMut::Vacant(v) = entry { v.insert_with_hasher(hash, key, (), |key| { - compute_hash(&state, get_bytes(&values_builder, key)) + state.hash_one(get_bytes(&values_builder, key)) }); } @@ -217,7 +217,7 @@ where let state = &self.state; let storage = &mut self.values_builder; - let hash = compute_hash(state, value.as_bytes()); + let hash = state.hash_one(value.as_bytes()); let entry = self .dedup @@ -234,7 +234,7 @@ where *entry .insert_with_hasher(hash, key, (), |key| { - compute_hash(state, get_bytes(storage, key)) + state.hash_one(get_bytes(storage, key)) }) .0 } @@ -268,13 +268,6 @@ where } } -fn compute_hash(hasher: &ahash::RandomState, value: &[u8]) -> u64 { - use std::hash::{BuildHasher, Hash, Hasher}; - let mut state = hasher.build_hasher(); - value.hash(&mut state); - state.finish() -} - fn get_bytes<'a, K: ArrowNativeType>(values: &'a StringBuilder, key: &K) -> &'a [u8] { let offsets = values.offsets_slice(); let values = values.values_slice(); diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 43c43b04a514..37581b93fde2 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -19,8 +19,8 @@ //! common attributes and operations for Arrow array. 
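The dictionary-builder change above drops the hand-rolled `compute_hash` helper (build a hasher, feed the bytes, call `finish`) in favor of `RandomState::hash_one`, which the builder's existing `ahash::RandomState` already provides. A minimal sketch of the call as the builder now uses it, assuming `ahash` is available as a direct dependency in the example:

```rust
fn main() {
    // One shared RandomState keeps hashing deterministic: the same bytes
    // always map to the same 64-bit hash for that state.
    let state = ahash::RandomState::new();

    let a = state.hash_one(b"arrow".as_slice());
    let b = state.hash_one(b"arrow".as_slice());
    assert_eq!(a, b);
}
```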
use crate::datatypes::{ - validate_decimal256_precision, validate_decimal_precision, DataType, IntervalUnit, - UnionMode, + validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, DataType, + IntervalUnit, UnionMode, }; use crate::error::{ArrowError, Result}; use crate::util::bit_iterator::BitSliceIterator; @@ -30,7 +30,6 @@ use crate::{ util::bit_util, }; use half::f16; -use num::BigInt; use std::convert::TryInto; use std::mem; use std::ops::Range; @@ -396,18 +395,24 @@ impl ArrayData { /// panic's if the new DataType is not compatible with the /// existing type. /// - /// Note: currently only changing a [DataType::Decimal128]s precision - /// and scale are supported + /// Note: currently only changing a [DataType::Decimal128]s or + /// [DataType::Decimal256]s precision and scale are supported #[inline] pub(crate) fn with_data_type(mut self, new_data_type: DataType) -> Self { - assert!( - matches!(self.data_type, DataType::Decimal128(_, _)), - "only DecimalType is supported for existing type" - ); - assert!( - matches!(new_data_type, DataType::Decimal128(_, _)), - "only DecimalType is supported for new datatype" - ); + if matches!(self.data_type, DataType::Decimal128(_, _)) { + assert!( + matches!(new_data_type, DataType::Decimal128(_, _)), + "only 128-bit DecimalType is supported for new datatype" + ); + } else if matches!(self.data_type, DataType::Decimal256(_, _)) { + assert!( + matches!(new_data_type, DataType::Decimal256(_, _)), + "only 256-bit DecimalType is supported for new datatype" + ); + } else { + panic!("only DecimalType is supported.") + } + self.data_type = new_data_type; self } @@ -1043,9 +1048,7 @@ impl ArrayData { for pos in 0..self.len() { let offset = pos * 32; let raw_bytes = &values[offset..offset + 32]; - let integer = BigInt::from_signed_bytes_le(raw_bytes); - let value_str = integer.to_string(); - validate_decimal256_precision(&value_str, *p)?; + validate_decimal256_precision_with_lt_bytes(raw_bytes, *p)?; } Ok(()) } diff --git a/arrow/src/array/equal/mod.rs b/arrow/src/array/equal/mod.rs index 3387e2842264..6fdc06f837c0 100644 --- a/arrow/src/array/equal/mod.rs +++ b/arrow/src/array/equal/mod.rs @@ -262,7 +262,6 @@ mod tests { use std::convert::TryFrom; use std::sync::Arc; - use crate::array::BasicDecimalArray; use crate::array::{ array::Array, ArrayData, ArrayDataBuilder, ArrayRef, BooleanArray, FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, Int32Builder, diff --git a/arrow/src/array/equal_json.rs b/arrow/src/array/equal_json.rs deleted file mode 100644 index e7d14aae81a8..000000000000 --- a/arrow/src/array/equal_json.rs +++ /dev/null @@ -1,1170 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use super::*; -use crate::array::BasicDecimalArray; -use crate::datatypes::*; -use crate::util::decimal::BasicDecimal; -use array::Array; -use hex::FromHex; -use serde_json::value::Value::{Null as JNull, Object, String as JString}; -use serde_json::Value; - -/// Trait for comparing arrow array with json array -pub trait JsonEqual { - /// Checks whether arrow array equals to json array. - fn equals_json(&self, json: &[&Value]) -> bool; - - /// Checks whether arrow array equals to json array. - fn equals_json_values(&self, json: &[Value]) -> bool { - let refs = json.iter().collect::>(); - - self.equals_json(&refs) - } -} - -impl<'a, T: JsonEqual> JsonEqual for &'a T { - fn equals_json(&self, json: &[&Value]) -> bool { - T::equals_json(self, json) - } - - fn equals_json_values(&self, json: &[Value]) -> bool { - T::equals_json_values(self, json) - } -} - -/// Implement array equals for numeric type -impl JsonEqual for PrimitiveArray { - fn equals_json(&self, json: &[&Value]) -> bool { - self.len() == json.len() - && (0..self.len()).all(|i| match json[i] { - Value::Null => self.is_null(i), - v => { - self.is_valid(i) - && Some(v) == self.value(i).into_json_value().as_ref() - } - }) - } -} - -/// Implement array equals for numeric type -impl JsonEqual for BooleanArray { - fn equals_json(&self, json: &[&Value]) -> bool { - self.len() == json.len() - && (0..self.len()).all(|i| match json[i] { - Value::Null => self.is_null(i), - v => { - self.is_valid(i) - && Some(v) == self.value(i).into_json_value().as_ref() - } - }) - } -} - -impl PartialEq for PrimitiveArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(array) => self.equals_json_values(array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &PrimitiveArray) -> bool { - match self { - Value::Array(array) => arrow.equals_json_values(array), - _ => false, - } - } -} - -impl JsonEqual for GenericListArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - Value::Array(v) => self.is_valid(i) && self.value(i).equals_json_values(v), - Value::Null => self.is_null(i) || self.value_length(i).is_zero(), - _ => false, - }) - } -} - -impl PartialEq for GenericListArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &GenericListArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for DictionaryArray { - fn equals_json(&self, json: &[&Value]) -> bool { - // todo: this is wrong: we must test the values also - self.keys().equals_json(json) - } -} - -impl PartialEq for DictionaryArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &DictionaryArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for FixedSizeListArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - Value::Array(v) => self.is_valid(i) && self.value(i).equals_json_values(v), - Value::Null => self.is_null(i) || self.value_length() == 0, - _ => false, - }) - } -} - 
-impl PartialEq for FixedSizeListArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &FixedSizeListArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for StructArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - let all_object = json.iter().all(|v| matches!(v, Object(_) | JNull)); - - if !all_object { - return false; - } - - for column_name in self.column_names() { - let json_values = json - .iter() - .map(|obj| obj.get(column_name).unwrap_or(&Value::Null)) - .collect::>(); - - if !self - .column_by_name(column_name) - .map(|arr| arr.equals_json(&json_values)) - .unwrap_or(false) - { - return false; - } - } - - true - } -} - -impl PartialEq for StructArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &StructArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for MapArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - Value::Array(v) => self.is_valid(i) && self.value(i).equals_json_values(v), - Value::Null => self.is_null(i) || self.value_length(i).eq(&0), - _ => false, - }) - } -} - -impl PartialEq for MapArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &MapArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for GenericBinaryArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => { - // binary data is sometimes hex encoded, this checks if bytes are equal, - // and if not converting to hex is attempted - self.is_valid(i) - && (s.as_str().as_bytes() == self.value(i) - || Vec::from_hex(s.as_str()) == Ok(self.value(i).to_vec())) - } - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq for GenericBinaryArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &GenericBinaryArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for GenericStringArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => self.is_valid(i) && s.as_str() == self.value(i), - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq for GenericStringArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &GenericStringArray) -> bool { - match self { - Value::Array(json_array) => 
arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for FixedSizeBinaryArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => { - // binary data is sometimes hex encoded, this checks if bytes are equal, - // and if not converting to hex is attempted - self.is_valid(i) - && (s.as_str().as_bytes() == self.value(i) - || Vec::from_hex(s.as_str()) == Ok(self.value(i).to_vec())) - } - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq for FixedSizeBinaryArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &FixedSizeBinaryArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for Decimal128Array { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => { - self.is_valid(i) - && (s - .parse::() - .map_or_else(|_| false, |v| v == self.value(i).as_i128())) - } - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl JsonEqual for Decimal256Array { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => self.is_valid(i) && (s == &self.value(i).to_string()), - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq for Decimal128Array { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &Decimal128Array) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for UnionArray { - fn equals_json(&self, _json: &[&Value]) -> bool { - unimplemented!( - "Added to allow UnionArray to implement the Array trait: see ARROW-8547" - ) - } -} - -impl JsonEqual for NullArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - // all JSON values must be nulls - json.iter().all(|&v| v == &JNull) - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &NullArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for NullArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for ArrayRef { - fn equals_json(&self, json: &[&Value]) -> bool { - self.as_ref().equals_json(json) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::error::Result; - use std::{convert::TryFrom, sync::Arc}; - - fn create_list_array, T: AsRef<[Option]>>( - builder: &mut ListBuilder, - data: T, - ) -> Result { - for d in data.as_ref() { - if let Some(v) = d { - builder.values().append_slice(v.as_ref()); - builder.append(true); - } else { - builder.append(false); - } - } - Ok(builder.finish()) - } - - /// Create a fixed size list of 2 value lengths - fn create_fixed_size_list_array, T: AsRef<[Option]>>( - builder: &mut FixedSizeListBuilder, - data: T, - ) -> Result { - for d in data.as_ref() { - if let Some(v) = d { - 
builder.values().append_slice(v.as_ref()); - builder.append(true); - } else { - for _ in 0..builder.value_length() { - builder.values().append_null(); - } - builder.append(false); - } - } - Ok(builder.finish()) - } - - #[test] - fn test_primitive_json_equal() { - // Test equaled array - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - [ - 1, null, 2, 3 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequaled array - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - [ - 1, 1, 2, 3 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - [ - 1, 1 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test not json array type case - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_list_json_equal() { - // Test equal case - let arrow_array = create_list_array( - &mut ListBuilder::new(Int32Builder::new(10)), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - null, - [4, 5, 6] - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = create_list_array( - &mut ListBuilder::new(Int32Builder::new(10)), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - [7, 8], - [4, 5, 6] - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = create_list_array( - &mut ListBuilder::new(Int32Builder::new(10)), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_fixed_size_list_json_equal() { - // Test equal case - let arrow_array = create_fixed_size_list_array( - &mut FixedSizeListBuilder::new(Int32Builder::new(10), 3), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - null, - [4, 5, 6] - ] - "#, - ) - .unwrap(); - println!("{:?}", arrow_array); - println!("{:?}", json_array); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = create_fixed_size_list_array( - &mut FixedSizeListBuilder::new(Int32Builder::new(10), 3), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - [7, 8, 9], - [4, 5, 6] - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = create_fixed_size_list_array( - &mut 
FixedSizeListBuilder::new(Int32Builder::new(10), 3), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_string_json_equal() { - // Test the equal case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None, None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "world", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None, None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect value type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - 1, - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_binary_json_equal() { - // Test the equal case - let mut builder = BinaryBuilder::new(6); - builder.append_value(b"hello"); - builder.append_null(); - builder.append_null(); - builder.append_value(b"world"); - builder.append_null(); - builder.append_null(); - let arrow_array = builder.finish(); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "world", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None, None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // 
Test incorrect value type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - 1, - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_fixed_size_binary_json_equal() { - // Test the equal case - let mut builder = FixedSizeBinaryBuilder::new(15, 5); - builder.append_value(b"hello").unwrap(); - builder.append_null(); - builder.append_value(b"world").unwrap(); - let arrow_array: FixedSizeBinaryArray = builder.finish(); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - "world" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - builder.append_value(b"hello").unwrap(); - builder.append_null(); - builder.append_value(b"world").unwrap(); - let arrow_array: FixedSizeBinaryArray = builder.finish(); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - "arrow" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "world" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect value type case - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - 1 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_decimal_json_equal() { - // Test the equal case - let arrow_array = [Some(1_000), None, Some(-250)] - .iter() - .collect::() - .with_precision_and_scale(23, 6) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - "1000", - null, - "-250" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = [Some(1_000), None, Some(55)] - .iter() - .collect::() - .with_precision_and_scale(23, 6) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - "1000", - null, - "-250" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let json_array: Value = serde_json::from_str( - r#" - [ - "1000", - null, - null, - "55" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect value type case - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - 1 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_struct_json_equal() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - 
Some(4), - Some(5), - ])); - - let arrow_array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - - let json_array: Value = serde_json::from_str( - r#" - [ - { - "f1": "joe", - "f2": 1 - }, - { - "f2": 2 - }, - null, - { - "f1": "mark", - "f2": 4 - }, - { - "f1": "doe", - "f2": 5 - } - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal length case - let json_array: Value = serde_json::from_str( - r#" - [ - { - "f1": "joe", - "f2": 1 - }, - { - "f2": 2 - }, - null, - { - "f1": "mark", - "f2": 4 - } - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let json_array: Value = serde_json::from_str( - r#" - { - "f1": "joe", - "f2": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test not all object case - let json_array: Value = serde_json::from_str( - r#" - [ - { - "f1": "joe", - "f2": 1 - }, - 2, - null, - { - "f1": "mark", - "f2": 4 - } - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_null_json_equal() { - // Test equaled array - let arrow_array = NullArray::new(4); - let json_array: Value = serde_json::from_str( - r#" - [ - null, null, null, null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequaled array - let arrow_array = NullArray::new(2); - let json_array: Value = serde_json::from_str( - r#" - [ - null, null, null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } -} diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index 12d6f440b78d..72030f900a4e 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -25,7 +25,7 @@ use crate::{ ffi::ArrowArrayRef, }; -use super::ArrayData; +use super::{make_array, ArrayData, ArrayRef}; impl TryFrom for ArrayData { type Error = ArrowError; @@ -39,10 +39,46 @@ impl TryFrom for ffi::ArrowArray { type Error = ArrowError; fn try_from(value: ArrayData) -> Result { - unsafe { ffi::ArrowArray::try_new(value) } + ffi::ArrowArray::try_new(value) } } +/// Creates a new array from two FFI pointers. Used to import arrays from the C Data Interface +/// # Safety +/// Assumes that these pointers represent valid C Data Interfaces, both in memory +/// representation and lifetime via the `release` mechanism. +pub unsafe fn make_array_from_raw( + array: *const ffi::FFI_ArrowArray, + schema: *const ffi::FFI_ArrowSchema, +) -> Result { + let array = ffi::ArrowArray::try_from_raw(array, schema)?; + let data = ArrayData::try_from(array)?; + Ok(make_array(data)) +} + +/// Exports an array to raw pointers of the C Data Interface provided by the consumer. +/// # Safety +/// Assumes that these pointers represent valid C Data Interfaces, both in memory +/// representation and lifetime via the `release` mechanism. +/// +/// This function copies the content of two FFI structs [ffi::FFI_ArrowArray] and +/// [ffi::FFI_ArrowSchema] in the array to the location pointed by the raw pointers. +/// Usually the raw pointers are provided by the array data consumer. 
+pub unsafe fn export_array_into_raw( + src: ArrayRef, + out_array: *mut ffi::FFI_ArrowArray, + out_schema: *mut ffi::FFI_ArrowSchema, +) -> Result<()> { + let data = src.data(); + let array = ffi::FFI_ArrowArray::new(data); + let schema = ffi::FFI_ArrowSchema::try_from(data.data_type())?; + + std::ptr::write_unaligned(out_array, array); + std::ptr::write_unaligned(out_schema, schema); + + Ok(()) +} + #[cfg(test)] mod tests { use crate::array::{DictionaryArray, FixedSizeListArray, Int32Array, StringArray}; diff --git a/arrow/src/array/iterator.rs b/arrow/src/array/iterator.rs index 8ee9f25447d3..7cc9bde6b4c5 100644 --- a/arrow/src/array/iterator.rs +++ b/arrow/src/array/iterator.rs @@ -16,7 +16,7 @@ // under the License. use crate::array::array::ArrayAccessor; -use crate::array::{BasicDecimalArray, Decimal256Array}; +use crate::array::BasicDecimalArray; use super::{ Array, BooleanArray, Decimal128Array, GenericBinaryArray, GenericListArray, @@ -104,14 +104,15 @@ pub type GenericStringIter<'a, T> = ArrayIter<&'a GenericStringArray>; pub type GenericBinaryIter<'a, T> = ArrayIter<&'a GenericBinaryArray>; pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; +pub type BasicDecimalIter<'a, const BYTE_WIDTH: usize> = + ArrayIter<&'a BasicDecimalArray>; /// an iterator that returns `Some(Decimal128)` or `None`, that can be used on a /// [`Decimal128Array`] -pub type Decimal128Iter<'a> = ArrayIter<&'a Decimal128Array>; +pub type Decimal128Iter<'a> = BasicDecimalIter<'a, 16>; /// an iterator that returns `Some(Decimal256)` or `None`, that can be used on a -/// [`Decimal256Array`] -pub type Decimal256Iter<'a> = ArrayIter<&'a Decimal256Array>; - +/// [`super::Decimal256Array`] +pub type Decimal256Iter<'a> = BasicDecimalIter<'a, 32>; /// an iterator that returns `Some(i128)` or `None`, that can be used on a /// [`Decimal128Array`] #[derive(Debug)] diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 2050bf959902..4a7667741597 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -175,7 +175,7 @@ mod builder; mod cast; mod data; mod equal; -mod equal_json; +#[cfg(feature = "ffi")] mod ffi; mod iterator; mod null; @@ -208,7 +208,7 @@ pub use self::array_fixed_size_list::FixedSizeListArray; #[deprecated(note = "Please use `Decimal128Array` instead")] pub type DecimalArray = Decimal128Array; -pub use self::array_dictionary::DictionaryArray; +pub use self::array_dictionary::{DictionaryArray, TypedDictionaryArray}; pub use self::array_list::LargeListArray; pub use self::array_list::ListArray; pub use self::array_map::MapArray; @@ -596,10 +596,6 @@ pub use self::transform::{Capacities, MutableArrayData}; pub use self::iterator::*; -// --------------------- Array Equality --------------------- - -pub use self::equal_json::JsonEqual; - // --------------------- Array's values comparison --------------------- pub use self::ord::{build_compare, DynComparator}; @@ -615,7 +611,8 @@ pub use self::cast::{ // ------------------------------ C Data Interface --------------------------- -pub use self::array::{export_array_into_raw, make_array_from_raw}; +#[cfg(feature = "ffi")] +pub use self::ffi::{export_array_into_raw, make_array_from_raw}; #[cfg(test)] mod tests { diff --git a/arrow/src/array/ord.rs b/arrow/src/array/ord.rs index 1e19c7cc2fca..47173aa7d927 100644 --- a/arrow/src/array/ord.rs +++ b/arrow/src/array/ord.rs @@ -19,7 +19,6 @@ use std::cmp::Ordering; -use crate::array::BasicDecimalArray; use crate::array::*; use crate::datatypes::TimeUnit; use 
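`make_array_from_raw` and `export_array_into_raw` above pair up for a round trip through the C Data Interface. The following is a minimal sketch, not part of this patch: it assumes the `ffi` feature is enabled, and that `FFI_ArrowArray::empty()` / `FFI_ArrowSchema::empty()` are available to stand in for the structs a real consumer would allocate; the `ffi_round_trip` wrapper name is illustrative.

```rust
use std::sync::Arc;

use arrow::array::{export_array_into_raw, make_array_from_raw, Array, ArrayRef, Int32Array};
use arrow::error::Result;
use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema};

fn ffi_round_trip() -> Result<()> {
    let src: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)]));

    // In a real integration these allocations come from the consumer;
    // here we leak two boxes to get stable raw pointers (cleanup elided).
    let out_array = Box::into_raw(Box::new(FFI_ArrowArray::empty()));
    let out_schema = Box::into_raw(Box::new(FFI_ArrowSchema::empty()));

    unsafe {
        // Copy the FFI representation of `src` into the consumer's structs ...
        export_array_into_raw(src, out_array, out_schema)?;
        // ... and build a new array back from the same pointers.
        let imported = make_array_from_raw(out_array, out_schema)?;
        assert_eq!(imported.len(), 3);
        assert_eq!(imported.null_count(), 1);
    }
    Ok(())
}
```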
crate::datatypes::*; diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs index 564ef444a1dd..f0fccef14fd7 100644 --- a/arrow/src/array/transform/mod.rs +++ b/arrow/src/array/transform/mod.rs @@ -313,11 +313,7 @@ fn preallocate_offset_and_binary_buffer( // offsets let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); // safety: `unsafe` code assumes that this buffer is initialized with one element - if Offset::IS_LARGE { - buffer.push(0i64); - } else { - buffer.push(0i32) - } + buffer.push(Offset::zero()); [ buffer, @@ -674,8 +670,6 @@ mod tests { use std::{convert::TryFrom, sync::Arc}; use super::*; - - use crate::array::BasicDecimalArray; use crate::array::Decimal128Array; use crate::{ array::{ diff --git a/arrow/src/buffer/immutable.rs b/arrow/src/buffer/immutable.rs index cb686bd8441c..8ec5a4554208 100644 --- a/arrow/src/buffer/immutable.rs +++ b/arrow/src/buffer/immutable.rs @@ -22,7 +22,6 @@ use std::sync::Arc; use std::{convert::AsRef, usize}; use crate::alloc::{Allocation, Deallocation}; -use crate::ffi::FFI_ArrowArray; use crate::util::bit_chunk_iterator::{BitChunks, UnalignedBitChunk}; use crate::{bytes::Bytes, datatypes::ArrowNativeType}; @@ -77,30 +76,6 @@ impl Buffer { Buffer::build_with_arguments(ptr, len, Deallocation::Arrow(capacity)) } - /// Creates a buffer from an existing memory region (must already be byte-aligned), this - /// `Buffer` **does not** free this piece of memory when dropped. - /// - /// # Arguments - /// - /// * `ptr` - Pointer to raw parts - /// * `len` - Length of raw parts in **bytes** - /// * `data` - An [crate::ffi::FFI_ArrowArray] with the data - /// - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` - /// bytes and that the foreign deallocator frees the region. - #[deprecated( - note = "use from_custom_allocation instead which makes it clearer that the allocation is in fact owned" - )] - pub unsafe fn from_unowned( - ptr: NonNull, - len: usize, - data: Arc, - ) -> Self { - Self::from_custom_allocation(ptr, len, data) - } - /// Creates a buffer from an existing memory region. Ownership of the memory is tracked via reference counting /// and the memory will be freed using the `drop` method of [crate::alloc::Allocation] when the reference count reaches zero. /// diff --git a/arrow/src/compute/README.md b/arrow/src/compute/README.md index 761713a531b4..a5d15a83046f 100644 --- a/arrow/src/compute/README.md +++ b/arrow/src/compute/README.md @@ -33,16 +33,16 @@ We use the term "kernel" to refer to particular general operation that contains Types of functions -* Scalar functions: elementwise functions that perform scalar operations in a +- Scalar functions: elementwise functions that perform scalar operations in a vectorized manner. These functions are generally valid for SQL-like context. These are called "scalar" in that the functions executed consider each value in an array independently, and the output array or arrays have the same length as the input arrays. The result for each array cell is generally independent of its position in the array. -* Vector functions, which produce a result whose output is generally dependent +- Vector functions, which produce a result whose output is generally dependent on the entire contents of the input arrays. These functions **are generally not valid** for SQL-like processing because the output size may be different than the input size, and the result may change based on the order of the values in the array. 
This includes things like array subselection, sorting, hashing, and more. -* Scalar aggregate functions of which can be used in a SQL-like context \ No newline at end of file +- Scalar aggregate functions of which can be used in a SQL-like context diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index c6b8f477986f..ddca0c2e9351 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -36,6 +36,7 @@ //! ``` use chrono::Timelike; +use std::ops::{Div, Mul}; use std::str; use std::sync::Arc; @@ -53,7 +54,7 @@ use crate::temporal_conversions::{ use crate::{array::*, compute::take}; use crate::{buffer::Buffer, util::serialization::lexical_to_string}; use num::cast::AsPrimitive; -use num::{NumCast, ToPrimitive}; +use num::{BigInt, NumCast, ToPrimitive}; /// CastOptions provides a way to override the default cast behaviors #[derive(Debug)] @@ -78,6 +79,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { // TODO UTF8/unsigned numeric to decimal // cast one decimal type to another decimal type (Decimal128(_, _), Decimal128(_, _)) => true, + (Decimal256(_, _), Decimal256(_, _)) => true, + (Decimal128(_, _), Decimal256(_, _)) => true, + (Decimal256(_, _), Decimal128(_, _)) => true, // signed numeric to decimal (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal128(_, _)) | // decimal to signed numeric @@ -432,7 +436,16 @@ pub fn cast_with_options( } match (from_type, to_type) { (Decimal128(_, s1), Decimal128(p2, s2)) => { - cast_decimal_to_decimal(array, s1, p2, s2) + cast_decimal_to_decimal::<16, 16>(array, s1, p2, s2) + } + (Decimal256(_, s1), Decimal256(p2, s2)) => { + cast_decimal_to_decimal::<32, 32>(array, s1, p2, s2) + } + (Decimal128(_, s1), Decimal256(p2, s2)) => { + cast_decimal_to_decimal::<16, 32>(array, s1, p2, s2) + } + (Decimal256(_, s1), Decimal128(p2, s2)) => { + cast_decimal_to_decimal::<32, 16>(array, s1, p2, s2) } (Decimal128(_, scale), _) => { // cast decimal to other type @@ -1252,34 +1265,123 @@ const fn time_unit_multiple(unit: &TimeUnit) -> i64 { } /// Cast one type of decimal array to another type of decimal array -fn cast_decimal_to_decimal( +fn cast_decimal_to_decimal( array: &ArrayRef, input_scale: &usize, output_precision: &usize, output_scale: &usize, ) -> Result { - let array = array.as_any().downcast_ref::().unwrap(); - - let output_array = if input_scale > output_scale { + if input_scale > output_scale { // For example, input_scale is 4 and output_scale is 3; // Original value is 11234_i128, and will be cast to 1123_i128. 
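Stepping back to the kernel taxonomy in compute/README.md above, the three categories map onto concrete calls roughly as below. This is a sketch only: the module paths `kernels::{arithmetic, sort, aggregate}` and the usual signatures of `add`, `sort` and `min` are assumed, and the `kernel_kinds` wrapper is illustrative.

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, Int32Array};
use arrow::compute::kernels::{aggregate, arithmetic, sort};
use arrow::error::Result;

fn kernel_kinds() -> Result<()> {
    let a = Int32Array::from(vec![3, 1, 2]);
    let b = Int32Array::from(vec![10, 20, 30]);

    // Scalar (elementwise) kernel: output has the same length, cell by cell.
    let summed = arithmetic::add(&a, &b)?;
    assert_eq!(summed.len(), 3);

    // Vector kernel: the result depends on the whole input (here, its order).
    let unsorted: ArrayRef = Arc::new(Int32Array::from(vec![3, 1, 2]));
    let sorted = sort::sort(&unsorted, None)?;
    assert_eq!(sorted.len(), 3);

    // Scalar aggregate: the whole array collapses to a single value.
    assert_eq!(aggregate::min(&a), Some(1));
    Ok(())
}
```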
let div = 10_i128.pow((input_scale - output_scale) as u32); - array - .iter() - .map(|v| v.map(|v| v.as_i128() / div)) - .collect::() + if BYTE_WIDTH1 == 16 { + let array = array.as_any().downcast_ref::().unwrap(); + let iter = array.iter().map(|v| v.map(|v| v.as_i128() / div)); + if BYTE_WIDTH2 == 16 { + let output_array = iter + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } else { + let output_array = iter + .map(|v| v.map(BigInt::from)) + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } + } else { + let array = array.as_any().downcast_ref::().unwrap(); + let iter = array.iter().map(|v| v.map(|v| v.to_big_int().div(div))); + if BYTE_WIDTH2 == 16 { + let values = iter + .map(|v| { + if v.is_none() { + Ok(None) + } else { + v.as_ref().and_then(|v| v.to_i128()) + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), + ) + }) + .map(Some) + } + }) + .collect::>>()?; + + let output_array = values + .into_iter() + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } else { + let output_array = iter + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } + } } else { // For example, input_scale is 3 and output_scale is 4; // Original value is 1123_i128, and will be cast to 11230_i128. let mul = 10_i128.pow((output_scale - input_scale) as u32); - array - .iter() - .map(|v| v.map(|v| v.as_i128() * mul)) - .collect::() - } - .with_precision_and_scale(*output_precision, *output_scale)?; + if BYTE_WIDTH1 == 16 { + let array = array.as_any().downcast_ref::().unwrap(); + let iter = array.iter().map(|v| v.map(|v| v.as_i128() * mul)); + if BYTE_WIDTH2 == 16 { + let output_array = iter + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } else { + let output_array = iter + .map(|v| v.map(BigInt::from)) + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; - Ok(Arc::new(output_array)) + Ok(Arc::new(output_array)) + } + } else { + let array = array.as_any().downcast_ref::().unwrap(); + let iter = array.iter().map(|v| v.map(|v| v.to_big_int().mul(mul))); + if BYTE_WIDTH2 == 16 { + let values = iter + .map(|v| { + if v.is_none() { + Ok(None) + } else { + v.as_ref().and_then(|v| v.to_i128()) + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), + ) + }) + .map(Some) + } + }) + .collect::>>()?; + + let output_array = values + .into_iter() + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } else { + let output_array = iter + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } + } + } } /// Cast an array by changing its array_data type to the desired type @@ -2420,9 +2522,8 @@ where #[cfg(test)] mod tests { use super::*; - use crate::array::BasicDecimalArray; use crate::datatypes::TimeUnit; - use crate::util::decimal::Decimal128; + use crate::util::decimal::{Decimal128, Decimal256}; use crate::{buffer::Buffer, util::display::array_value_to_string}; macro_rules! 
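At the user level the new decimal paths are reached through the `cast` kernel. A small sketch of widening `Decimal128(20, 3)` to `Decimal256(20, 4)` follows; the values, precisions and scales are illustrative, and the `widen_decimal` wrapper is not part of the patch.

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, Decimal128Array, Decimal256Array};
use arrow::compute::cast;
use arrow::datatypes::DataType;
use arrow::error::Result;

fn widen_decimal() -> Result<()> {
    // 1123.456 at scale 3, stored as the unscaled value 1123456.
    let input = [Some(1123456_i128), None]
        .into_iter()
        .collect::<Decimal128Array>()
        .with_precision_and_scale(20, 3)?;
    let input: ArrayRef = Arc::new(input);

    // Widening to Decimal256(20, 4) multiplies the unscaled values by 10,
    // so 1123456 becomes 11234560; nulls are carried through unchanged.
    let output = cast(&input, &DataType::Decimal256(20, 4))?;
    let output = output
        .as_any()
        .downcast_ref::<Decimal256Array>()
        .expect("cast produced a Decimal256 array");
    assert_eq!(output.len(), 2);
    assert!(output.is_null(1));
    Ok(())
}
```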
generate_cast_test_case { @@ -2461,8 +2562,19 @@ mod tests { .with_precision_and_scale(precision, scale) } + fn create_decimal256_array( + array: Vec>, + precision: usize, + scale: usize, + ) -> Result { + array + .into_iter() + .collect::() + .with_precision_and_scale(precision, scale) + } + #[test] - fn test_cast_decimal_to_decimal() { + fn test_cast_decimal128_to_decimal128() { let input_type = DataType::Decimal128(20, 3); let output_type = DataType::Decimal128(20, 4); assert!(can_cast_types(&input_type, &output_type)); @@ -2490,6 +2602,97 @@ mod tests { result.unwrap_err().to_string()); } + #[test] + fn test_cast_decimal128_to_decimal256() { + let input_type = DataType::Decimal128(20, 3); + let output_type = DataType::Decimal256(20, 4); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let input_decimal_array = create_decimal_array(&array, 20, 3).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal256Array, + &output_type, + vec![ + Some( + Decimal256::from_big_int(&BigInt::from(11234560_i128), 20, 4) + .unwrap() + ), + Some( + Decimal256::from_big_int(&BigInt::from(21234560_i128), 20, 4) + .unwrap() + ), + Some( + Decimal256::from_big_int(&BigInt::from(31234560_i128), 20, 4) + .unwrap() + ), + None + ] + ); + } + + #[test] + fn test_cast_decimal256_to_decimal128() { + let input_type = DataType::Decimal256(20, 3); + let output_type = DataType::Decimal128(20, 4); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![ + Some(BigInt::from(1123456)), + Some(BigInt::from(2123456)), + Some(BigInt::from(3123456)), + None, + ]; + let input_decimal_array = create_decimal256_array(array, 20, 3).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal128Array, + &output_type, + vec![ + Some(Decimal128::new_from_i128(20, 4, 11234560_i128)), + Some(Decimal128::new_from_i128(20, 4, 21234560_i128)), + Some(Decimal128::new_from_i128(20, 4, 31234560_i128)), + None + ] + ); + } + + #[test] + fn test_cast_decimal256_to_decimal256() { + let input_type = DataType::Decimal256(20, 3); + let output_type = DataType::Decimal256(20, 4); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![ + Some(BigInt::from(1123456)), + Some(BigInt::from(2123456)), + Some(BigInt::from(3123456)), + None, + ]; + let input_decimal_array = create_decimal256_array(array, 20, 3).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal256Array, + &output_type, + vec![ + Some( + Decimal256::from_big_int(&BigInt::from(11234560_i128), 20, 4) + .unwrap() + ), + Some( + Decimal256::from_big_int(&BigInt::from(21234560_i128), 20, 4) + .unwrap() + ), + Some( + Decimal256::from_big_int(&BigInt::from(31234560_i128), 20, 4) + .unwrap() + ), + None + ] + ); + } + #[test] fn test_cast_decimal_to_numeric() { let decimal_type = DataType::Decimal128(38, 2); diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index e4187ef87155..1d0bc938ece9 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -987,18 +987,19 @@ pub fn gt_eq_utf8_scalar( compare_op_scalar(left, |a| a >= right) } +// Avoids creating a closure for each combination of `$RIGHT` and `$TY` +fn try_to_type_result(value: Option, right: &str, ty: &str) -> Result { + value.ok_or_else(|| { + 
ArrowError::ComputeError(format!("Could not convert {} with {}", right, ty,)) + }) +} + /// Calls $RIGHT.$TY() (e.g. `right.to_i128()`) with a nice error message. /// Type of expression is `Result<.., ArrowError>` macro_rules! try_to_type { - ($RIGHT: expr, $TY: ident) => {{ - $RIGHT.$TY().ok_or_else(|| { - ArrowError::ComputeError(format!( - "Could not convert {} with {}", - stringify!($RIGHT), - stringify!($TY) - )) - }) - }}; + ($RIGHT: expr, $TY: ident) => { + try_to_type_result($RIGHT.$TY(), stringify!($RIGHT), stringify!($TYPE)) + }; } macro_rules! dyn_compare_scalar { @@ -1068,59 +1069,35 @@ macro_rules! dyn_compare_scalar { match $KT.as_ref() { DataType::UInt8 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::UInt16 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::UInt32 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::UInt64 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::Int8 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::Int16 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::Int32 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::Int64 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) 
} _ => Err(ArrowError::ComputeError(format!( "Unsupported dictionary key type {:?}", @@ -1186,7 +1163,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, eq_scalar) + dyn_compare_scalar!(left, right, key_type, eq_dyn_scalar) } _ => dyn_compare_scalar!(left, right, eq_scalar), } @@ -1200,7 +1177,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, lt_scalar) + dyn_compare_scalar!(left, right, key_type, lt_dyn_scalar) } _ => dyn_compare_scalar!(left, right, lt_scalar), } @@ -1214,7 +1191,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, lt_eq_scalar) + dyn_compare_scalar!(left, right, key_type, lt_eq_dyn_scalar) } _ => dyn_compare_scalar!(left, right, lt_eq_scalar), } @@ -1228,7 +1205,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, gt_scalar) + dyn_compare_scalar!(left, right, key_type, gt_dyn_scalar) } _ => dyn_compare_scalar!(left, right, gt_scalar), } @@ -1242,7 +1219,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, gt_eq_scalar) + dyn_compare_scalar!(left, right, key_type, gt_eq_dyn_scalar) } _ => dyn_compare_scalar!(left, right, gt_eq_scalar), } @@ -1256,7 +1233,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, neq_scalar) + dyn_compare_scalar!(left, right, key_type, neq_dyn_scalar) } _ => dyn_compare_scalar!(left, right, neq_scalar), } diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 0a3d0541ce3c..dca09a66a8cf 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -17,7 +17,6 @@ //! 
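The dictionary branches above now delegate to the `*_dyn_scalar` kernels on the dictionary values rather than the typed `*_scalar` kernels. A minimal usage sketch of the same entry point on a plain primitive array is shown below; the exact trait bounds on the scalar argument are not visible in this hunk, so a small integer literal is assumed to satisfy them, and the wrapper name is illustrative.

```rust
use arrow::array::{BooleanArray, Int32Array};
use arrow::compute::kernels::comparison::eq_dyn_scalar;
use arrow::error::Result;

fn compare_against_scalar() -> Result<()> {
    let values = Int32Array::from(vec![Some(1), None, Some(3), Some(3)]);

    // `eq_dyn_scalar` dispatches on the runtime DataType; for a
    // dictionary-encoded input it compares the dictionary values with the
    // matching *_dyn_scalar kernel and maps the result back through the keys.
    let mask: BooleanArray = eq_dyn_scalar(&values, 3)?;
    assert_eq!(
        mask,
        BooleanArray::from(vec![Some(false), None, Some(true), Some(true)])
    );
    Ok(())
}
```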
Defines sort kernel for `ArrayRef` -use crate::array::BasicDecimalArray; use crate::array::*; use crate::buffer::MutableBuffer; use crate::compute::take; diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index ab99acd2c04b..fb8f75651882 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -19,8 +19,6 @@ use std::{ops::AddAssign, sync::Arc}; -use crate::array::BasicDecimalArray; - use crate::buffer::{Buffer, MutableBuffer}; use crate::compute::util::{ take_value_indices_from_fixed_size_list, take_value_indices_from_list, @@ -614,23 +612,41 @@ where let mut output_buffer = MutableBuffer::new_null(len); let output_slice = output_buffer.as_slice_mut(); - indices - .iter() - .enumerate() - .try_for_each::<_, Result<()>>(|(i, index)| { - if let Some(index) = index { - let index = ToPrimitive::to_usize(&index).ok_or_else(|| { + let indices_has_nulls = indices.null_count() > 0; + + if indices_has_nulls { + indices + .iter() + .enumerate() + .try_for_each::<_, Result<()>>(|(i, index)| { + if let Some(index) = index { + let index = ToPrimitive::to_usize(&index).ok_or_else(|| { + ArrowError::ComputeError("Cast to usize failed".to_string()) + })?; + + if bit_util::get_bit(values_slice, values_offset + index) { + bit_util::set_bit(output_slice, i); + } + } + + Ok(()) + })?; + } else { + indices + .values() + .iter() + .enumerate() + .try_for_each::<_, Result<()>>(|(i, index)| { + let index = ToPrimitive::to_usize(index).ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) })?; if bit_util::get_bit(values_slice, values_offset + index) { bit_util::set_bit(output_slice, i); } - } - - Ok(()) - })?; - + Ok(()) + })?; + } Ok(output_buffer.into()) } diff --git a/arrow/src/compute/kernels/zip.rs b/arrow/src/compute/kernels/zip.rs index 0ee8e47bede0..c28529cf6762 100644 --- a/arrow/src/compute/kernels/zip.rs +++ b/arrow/src/compute/kernels/zip.rs @@ -44,7 +44,7 @@ pub fn zip( let falsy = falsy.data(); let truthy = truthy.data(); - let mut mutable = MutableArrayData::new(vec![&*truthy, &*falsy], false, truthy.len()); + let mut mutable = MutableArrayData::new(vec![truthy, falsy], false, truthy.len()); // the SlicesIterator slices only the true values. 
So the gaps left by this iterator we need to // fill with falsy values diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs index 29a90b65c237..974af9593e36 100644 --- a/arrow/src/compute/util.rs +++ b/arrow/src/compute/util.rs @@ -351,9 +351,7 @@ pub(super) mod tests { T: ArrowPrimitiveType, PrimitiveArray: From>>, { - use std::any::TypeId; - - let mut offset = vec![0]; + let mut offset = vec![S::zero()]; let mut values = vec![]; let list_len = data.len(); @@ -367,34 +365,18 @@ pub(super) mod tests { list_null_count += 1; bit_util::unset_bit(list_bitmap.as_slice_mut(), idx); } - offset.push(values.len() as i64); + offset.push(S::from_usize(values.len()).unwrap()); } let value_data = PrimitiveArray::::from(values).into_data(); - let (list_data_type, value_offsets) = if TypeId::of::() == TypeId::of::() - { - ( - DataType::List(Box::new(Field::new( - "item", - T::DATA_TYPE, - list_null_count == 0, - ))), - Buffer::from_slice_ref( - &offset.into_iter().map(|x| x as i32).collect::>(), - ), - ) - } else if TypeId::of::() == TypeId::of::() { - ( - DataType::LargeList(Box::new(Field::new( - "item", - T::DATA_TYPE, - list_null_count == 0, - ))), - Buffer::from_slice_ref(&offset), - ) - } else { - unreachable!() - }; + let (list_data_type, value_offsets) = ( + GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new(Field::new( + "item", + T::DATA_TYPE, + list_null_count == 0, + ))), + Buffer::from_slice_ref(&offset), + ); let list_data = ArrayData::builder(list_data_type) .len(list_len) diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index 7c533a8f8b24..f01ce37c7399 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -1116,7 +1116,6 @@ mod tests { use std::io::{Cursor, Write}; use tempfile::NamedTempFile; - use crate::array::BasicDecimalArray; use crate::array::*; use crate::compute::cast; use crate::datatypes::Field; diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index 034920d37537..1d922c8ebe68 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License. -use num::{BigInt, Num, ToPrimitive}; +use num::BigInt; +use std::cmp::Ordering; use std::fmt; use serde_derive::{Deserialize, Serialize}; use serde_json::{json, Value, Value::String as VString}; use crate::error::{ArrowError, Result}; +use crate::util::decimal::singed_cmp_le_bytes; use super::Field; @@ -263,6 +265,626 @@ impl fmt::Display for DataType { } } +// MAX decimal256 value of little-endian format for each precision. +// Each element is the max value of signed 256-bit integer for the specified precision which +// is encoded to the 32-byte width format of little-endian. 
+pub(crate) const MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [[u8; 32]; 76] = [ + [ + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, + ], + [ + 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, + ], + [ + 231, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ], + [ + 15, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ], + [ + 159, 134, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ], + [ + 63, 66, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ], + [ + 127, 150, 152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 224, 245, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 201, 154, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 227, 11, 84, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 231, 118, 72, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 15, 165, 212, 232, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 159, 114, 78, 24, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 63, 122, 16, 243, 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 127, 198, 164, 126, 141, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 192, 111, 242, 134, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 137, 93, 120, 69, 99, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 99, 167, 179, 182, 224, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 231, 137, 4, 35, 199, 138, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 15, 99, 45, 94, 199, 107, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 159, 222, 197, 173, 201, 53, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 63, 178, 186, 201, 224, 25, 30, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 127, 246, 74, 225, 199, 2, 45, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 160, 237, 204, 206, 27, 194, 211, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 73, 72, 1, 20, 22, 149, 69, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 227, 210, 12, 200, 220, 210, 183, 82, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 231, 60, 128, 208, 159, 60, 46, 59, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 15, 97, 2, 37, 62, 94, 206, 79, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 159, 202, 23, 114, 109, 174, 15, 30, 67, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 
255, 255, 63, 234, 237, 116, 70, 208, 156, 44, 159, 12, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 127, 38, 75, 145, 192, 34, 32, 190, 55, 126, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 128, 239, 172, 133, 91, 65, 109, 45, 238, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 9, 91, 193, 56, 147, 141, 68, 198, 77, 49, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 99, 142, 141, 55, 192, 135, 173, 190, 9, 237, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 231, 143, 135, 43, 130, 77, 199, 114, 97, 66, 19, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 15, 159, 75, 179, 21, 7, 201, 123, 206, 151, 192, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 159, 54, 244, 0, 217, 70, 218, 213, 16, 238, 133, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 63, 34, 138, 9, 122, 196, 134, 90, 168, 76, 59, 75, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 127, 86, 101, 95, 196, 172, 67, 137, 147, 254, 80, 240, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 96, 245, 185, 171, 191, 164, 92, 195, 241, 41, 99, 29, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 201, 149, 67, 181, 124, 111, 158, 161, 113, 163, 223, + 37, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 227, 217, 163, 20, 223, 90, 48, 80, 112, 98, 188, 122, + 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 231, 130, 102, 206, 182, 140, 227, 33, 99, 216, 91, 203, + 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 15, 29, 1, 16, 36, 127, 227, 82, 223, 115, 150, 241, + 123, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 159, 34, 11, 160, 104, 247, 226, 60, 185, 134, 224, 111, + 215, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 63, 90, 111, 64, 22, 170, 221, 96, 60, 67, 197, 94, 106, + 192, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 127, 134, 89, 132, 222, 164, 168, 200, 91, 160, 180, + 179, 39, 132, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 64, 127, 43, 177, 112, 150, 214, 149, 67, 14, 5, + 141, 41, 175, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 137, 248, 178, 235, 102, 224, 97, 218, 163, 142, + 50, 130, 159, 215, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 99, 181, 253, 52, 5, 196, 210, 135, 102, 146, 249, + 21, 59, 108, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 231, 21, 233, 17, 52, 168, 59, 78, 1, 184, 191, + 219, 78, 58, 172, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 15, 219, 26, 179, 8, 146, 84, 14, 13, 48, 125, 149, + 20, 71, 186, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 159, 142, 12, 255, 86, 180, 77, 143, 130, 224, 227, + 214, 205, 198, 70, 11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 63, 146, 125, 246, 101, 11, 9, 153, 25, 197, 230, + 100, 10, 196, 195, 112, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 127, 182, 231, 160, 251, 113, 90, 250, 255, 178, 3, + 241, 103, 
168, 165, 103, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 32, 13, 73, 212, 115, 136, 199, 255, 253, 36, + 106, 15, 148, 120, 12, 20, 4, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 73, 131, 218, 74, 134, 84, 203, 253, 235, 113, + 37, 154, 200, 181, 124, 200, 40, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 227, 32, 137, 236, 62, 77, 241, 233, 55, 115, + 118, 5, 214, 25, 223, 212, 151, 1, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 231, 72, 91, 61, 117, 4, 109, 35, 47, 128, + 160, 54, 92, 2, 183, 80, 238, 15, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 15, 217, 144, 101, 148, 44, 66, 98, 215, 1, + 69, 34, 154, 23, 38, 39, 79, 159, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 159, 122, 168, 247, 203, 189, 149, 214, 105, + 18, 178, 86, 5, 236, 124, 135, 23, 57, 6, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 63, 202, 148, 172, 247, 105, 217, 97, 34, 184, + 244, 98, 53, 56, 225, 74, 235, 58, 62, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 127, 230, 207, 189, 172, 35, 126, 210, 87, 49, + 143, 221, 21, 50, 204, 236, 48, 77, 110, 2, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 0, 31, 106, 191, 100, 237, 56, 110, 237, + 151, 167, 218, 244, 249, 63, 233, 3, 79, 24, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 9, 54, 37, 122, 239, 69, 57, 78, 70, 239, + 139, 138, 144, 195, 127, 28, 39, 22, 243, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 99, 28, 116, 197, 90, 187, 60, 14, 191, + 88, 119, 105, 165, 163, 253, 28, 135, 221, 126, 9, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 231, 27, 137, 182, 139, 81, 95, 142, 118, + 119, 169, 30, 118, 100, 232, 33, 71, 167, 244, 94, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 15, 23, 91, 33, 117, 47, 185, 143, 161, + 170, 158, 50, 157, 236, 19, 83, 199, 136, 142, 181, 3, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 159, 230, 142, 77, 147, 218, 59, 157, 79, + 170, 50, 250, 35, 62, 199, 62, 201, 87, 145, 23, 37, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 63, 2, 149, 7, 193, 137, 86, 36, 28, 167, + 250, 197, 103, 109, 200, 115, 220, 109, 173, 235, 114, 1, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 127, 22, 210, 75, 138, 97, 97, 107, 25, + 135, 202, 187, 13, 70, 212, 133, 156, 74, 198, 52, 125, 14, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 224, 52, 246, 102, 207, 205, 49, + 254, 70, 233, 85, 137, 188, 74, 58, 29, 234, 190, 15, 228, 144, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 201, 16, 158, 5, 26, 10, 242, 237, + 197, 28, 91, 93, 93, 235, 70, 36, 37, 117, 157, 232, 168, 5, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 227, 167, 44, 56, 4, 101, 116, 75, + 187, 31, 143, 165, 165, 49, 197, 106, 115, 147, 38, 22, 153, 56, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 231, 142, 190, 49, 42, 242, 139, + 242, 80, 61, 151, 119, 120, 240, 179, 43, 130, 194, 129, 221, 250, 53, 2, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 15, 149, 113, 241, 165, 117, 119, + 121, 41, 101, 232, 171, 180, 100, 7, 181, 21, 153, 17, 167, 204, 27, 22, + ], +]; + +// MIN decimal256 value of little-endian format for each precision. +// Each element is the min value of signed 256-bit integer for the specified precision which +// is encoded to the 76-byte width format of little-endian. 
+pub(crate) const MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [[u8; 32]; 76] = [ + [ + 247, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 157, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 25, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 241, 216, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 97, 121, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 193, 189, 240, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 129, 105, 103, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 31, 10, 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 54, 101, 196, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 28, 244, 171, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 24, 137, 183, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 240, 90, 43, 23, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 96, 141, 177, 231, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 192, 133, 239, 12, 165, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 128, 57, 91, 129, 114, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 63, 144, 13, 121, 220, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 118, 162, 135, 186, 156, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 156, 88, 76, 73, 31, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 24, 118, 251, 220, 56, 117, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 240, 156, 210, 161, 56, 148, 250, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 96, 33, 58, 82, 54, 202, 201, 255, 255, 255, 255, 255, 255, 
255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 192, 77, 69, 54, 31, 230, 225, 253, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 128, 9, 181, 30, 56, 253, 210, 234, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 95, 18, 51, 49, 228, 61, 44, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 182, 183, 254, 235, 233, 106, 186, 247, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 28, 45, 243, 55, 35, 45, 72, 173, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 24, 195, 127, 47, 96, 195, 209, 196, 252, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 240, 158, 253, 218, 193, 161, 49, 176, 223, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 96, 53, 232, 141, 146, 81, 240, 225, 188, 254, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 192, 21, 18, 139, 185, 47, 99, 211, 96, 243, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 128, 217, 180, 110, 63, 221, 223, 65, 200, 129, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 127, 16, 83, 122, 164, 190, 146, 210, 17, 251, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 246, 164, 62, 199, 108, 114, 187, 57, 178, 206, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 156, 113, 114, 200, 63, 120, 82, 65, 246, 18, 254, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 24, 112, 120, 212, 125, 178, 56, 141, 158, 189, 236, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 240, 96, 180, 76, 234, 248, 54, 132, 49, 104, 63, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 96, 201, 11, 255, 38, 185, 37, 42, 239, 17, 122, 248, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 192, 221, 117, 246, 133, 59, 121, 165, 87, 179, 196, 180, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 128, 169, 154, 160, 59, 83, 188, 118, 108, 1, 175, 15, 253, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 159, 10, 70, 84, 64, 91, 163, 60, 14, 214, 156, 226, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 54, 106, 188, 74, 131, 144, 97, 94, 142, 92, 32, 218, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 28, 38, 92, 235, 32, 165, 207, 175, 143, 157, 67, 133, 244, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 24, 125, 153, 49, 73, 115, 28, 222, 
156, 39, 164, 52, 141, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 240, 226, 254, 239, 219, 128, 28, 173, 32, 140, 105, 14, 132, 251, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 96, 221, 244, 95, 151, 8, 29, 195, 70, 121, 31, 144, 40, 211, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 192, 165, 144, 191, 233, 85, 34, 159, 195, 188, 58, 161, 149, 63, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 128, 121, 166, 123, 33, 91, 87, 55, 164, 95, 75, 76, 216, 123, + 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 191, 128, 212, 78, 143, 105, 41, 106, 188, 241, 250, 114, 214, + 80, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 118, 7, 77, 20, 153, 31, 158, 37, 92, 113, 205, 125, 96, 40, + 249, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 156, 74, 2, 203, 250, 59, 45, 120, 153, 109, 6, 234, 196, 147, + 187, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 24, 234, 22, 238, 203, 87, 196, 177, 254, 71, 64, 36, 177, 197, + 83, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 240, 36, 229, 76, 247, 109, 171, 241, 242, 207, 130, 106, 235, + 184, 69, 229, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 96, 113, 243, 0, 169, 75, 178, 112, 125, 31, 28, 41, 50, 57, + 185, 244, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 192, 109, 130, 9, 154, 244, 246, 102, 230, 58, 25, 155, 245, + 59, 60, 143, 245, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 128, 73, 24, 95, 4, 142, 165, 5, 0, 77, 252, 14, 152, 87, 90, + 152, 151, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 223, 242, 182, 43, 140, 119, 56, 0, 2, 219, 149, 240, 107, + 135, 243, 235, 251, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 182, 124, 37, 181, 121, 171, 52, 2, 20, 142, 218, 101, 55, + 74, 131, 55, 215, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 28, 223, 118, 19, 193, 178, 14, 22, 200, 140, 137, 250, 41, + 230, 32, 43, 104, 254, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 24, 183, 164, 194, 138, 251, 146, 220, 208, 127, 95, 201, + 163, 253, 72, 175, 17, 240, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 240, 38, 111, 154, 107, 211, 189, 157, 40, 254, 186, 221, + 101, 232, 217, 216, 176, 96, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 96, 133, 87, 8, 52, 66, 106, 41, 150, 237, 77, 169, 250, 19, + 131, 120, 232, 198, 249, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 192, 53, 107, 83, 8, 150, 38, 158, 221, 71, 11, 157, 202, + 199, 30, 181, 20, 197, 193, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 128, 25, 48, 66, 83, 220, 129, 45, 168, 206, 112, 34, 234, + 205, 51, 19, 207, 178, 145, 253, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 255, 224, 149, 64, 155, 18, 199, 145, 18, 104, 88, 37, + 11, 6, 192, 22, 252, 176, 231, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 246, 201, 218, 133, 16, 186, 198, 177, 185, 16, 116, 117, + 111, 60, 128, 227, 216, 233, 12, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 156, 227, 139, 58, 165, 68, 195, 241, 64, 
167, 136, 150, + 90, 92, 2, 227, 120, 34, 129, 246, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 24, 228, 118, 73, 116, 174, 160, 113, 137, 136, 86, 225, + 137, 155, 23, 222, 184, 88, 11, 161, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 240, 232, 164, 222, 138, 208, 70, 112, 94, 85, 97, 205, + 98, 19, 236, 172, 56, 119, 113, 74, 252, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 96, 25, 113, 178, 108, 37, 196, 98, 176, 85, 205, 5, 220, + 193, 56, 193, 54, 168, 110, 232, 218, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 192, 253, 106, 248, 62, 118, 169, 219, 227, 88, 5, 58, + 152, 146, 55, 140, 35, 146, 82, 20, 141, 254, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 128, 233, 45, 180, 117, 158, 158, 148, 230, 120, 53, 68, + 242, 185, 43, 122, 99, 181, 57, 203, 130, 241, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 31, 203, 9, 153, 48, 50, 206, 1, 185, 22, 170, 118, + 67, 181, 197, 226, 21, 65, 240, 27, 111, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 54, 239, 97, 250, 229, 245, 13, 18, 58, 227, 164, 162, + 162, 20, 185, 219, 218, 138, 98, 23, 87, 250, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 28, 88, 211, 199, 251, 154, 139, 180, 68, 224, 112, + 90, 90, 206, 58, 149, 140, 108, 217, 233, 102, 199, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 24, 113, 65, 206, 213, 13, 116, 13, 175, 194, 104, + 136, 135, 15, 76, 212, 125, 61, 126, 34, 5, 202, 253, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 240, 106, 142, 14, 90, 138, 136, 134, 214, 154, 23, + 84, 75, 155, 248, 74, 234, 102, 238, 88, 51, 228, 233, + ], +]; + /// `MAX_DECIMAL_FOR_EACH_PRECISION[p]` holds the maximum `i128` value /// that can be stored in [DataType::Decimal128] value of precision `p` pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ @@ -306,49 +928,6 @@ pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ 99999999999999999999999999999999999999, ]; -/// `MAX_DECIMAL_FOR_LARGER_PRECISION[p]` holds the maximum integer value -/// that can be stored in [DataType::Decimal256] value of precision `p` > 38 -pub const MAX_DECIMAL_FOR_LARGER_PRECISION: [&str; 38] = [ - "999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999999", - 
"9999999999999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999999999999999999999", -]; - /// `MIN_DECIMAL_FOR_EACH_PRECISION[p]` holds the minimum `i128` value /// that can be stored in a [DataType::Decimal128] value of precision `p` pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ @@ -392,49 +971,6 @@ pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ -99999999999999999999999999999999999999, ]; -/// `MIN_DECIMAL_FOR_LARGER_PRECISION[p]` holds the minimum integer value -/// that can be stored in a [DataType::Decimal256] value of precision `p` > 38 -pub const MIN_DECIMAL_FOR_LARGER_PRECISION: [&str; 38] = [ - "-999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999999999999999", - 
"-99999999999999999999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999999999999999999999", -]; - /// The maximum precision for [DataType::Decimal128] values pub const DECIMAL128_MAX_PRECISION: usize = 38; @@ -479,52 +1015,38 @@ pub(crate) fn validate_decimal_precision(value: i128, precision: usize) -> Resul } } -/// Validates that the specified string value can be properly -/// interpreted as a Decimal256 number with precision `precision` +/// Validates that the specified `byte_array` of little-endian format +/// value can be properly interpreted as a Decimal256 number with precision `precision` #[inline] -pub(crate) fn validate_decimal256_precision( - value: &str, +pub(crate) fn validate_decimal256_precision_with_lt_bytes( + lt_value: &[u8], precision: usize, -) -> Result { - if precision > 38 { - let max_str = MAX_DECIMAL_FOR_LARGER_PRECISION[precision - 38 - 1]; - let min_str = MIN_DECIMAL_FOR_LARGER_PRECISION[precision - 38 - 1]; - - let max = BigInt::from_str_radix(max_str, 10).unwrap(); - let min = BigInt::from_str_radix(min_str, 10).unwrap(); - - let value = BigInt::from_str_radix(value, 10).unwrap(); - if value > max { - Err(ArrowError::InvalidArgumentError(format!( - "{} is too large to store in a Decimal256 of precision {}. Max is {}", - value, precision, max - ))) - } else if value < min { - Err(ArrowError::InvalidArgumentError(format!( - "{} is too small to store in a Decimal256 of precision {}. Min is {}", - value, precision, min - ))) - } else { - Ok(value) - } +) -> Result<()> { + if precision > DECIMAL256_MAX_PRECISION { + return Err(ArrowError::InvalidArgumentError(format!( + "Max precision of a Decimal256 is {}, but got {}", + DECIMAL256_MAX_PRECISION, precision, + ))); + } + let max = MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[precision - 1]; + let min = MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[precision - 1]; + + if singed_cmp_le_bytes(lt_value, &max) == Ordering::Greater { + Err(ArrowError::InvalidArgumentError(format!( + "{:?} is too large to store in a Decimal256 of precision {}. Max is {:?}", + BigInt::from_signed_bytes_le(lt_value), + precision, + BigInt::from_signed_bytes_le(&max) + ))) + } else if singed_cmp_le_bytes(lt_value, &min) == Ordering::Less { + Err(ArrowError::InvalidArgumentError(format!( + "{:?} is too small to store in a Decimal256 of precision {}. Min is {:?}", + BigInt::from_signed_bytes_le(lt_value), + precision, + BigInt::from_signed_bytes_le(&min) + ))) } else { - let max = MAX_DECIMAL_FOR_EACH_PRECISION[precision - 1]; - let min = MIN_DECIMAL_FOR_EACH_PRECISION[precision - 1]; - let value = BigInt::from_str_radix(value, 10).unwrap(); - - if value.to_i128().unwrap() > max { - Err(ArrowError::InvalidArgumentError(format!( - "{} is too large to store in a Decimal256 of precision {}. Max is {}", - value, precision, max - ))) - } else if value.to_i128().unwrap() < min { - Err(ArrowError::InvalidArgumentError(format!( - "{} is too small to store in a Decimal256 of precision {}. 
Min is {}", - value, precision, min - ))) - } else { - Ok(value) - } + Ok(()) } } @@ -941,3 +1463,32 @@ impl DataType { } } } + +#[cfg(test)] +mod test { + use crate::datatypes::datatype::{ + MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION, + MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION, + }; + use crate::util::decimal::Decimal256; + use num::{BigInt, Num}; + + #[test] + fn test_decimal256_min_max_for_precision() { + // The precision from 1 to 76 + let mut max_value = "9".to_string(); + let mut min_value = "-9".to_string(); + for i in 1..77 { + let max_decimal = + Decimal256::from(BigInt::from_str_radix(max_value.as_str(), 10).unwrap()); + let min_decimal = + Decimal256::from(BigInt::from_str_radix(min_value.as_str(), 10).unwrap()); + let max_bytes = MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[i - 1]; + let min_bytes = MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[i - 1]; + max_value += "9"; + min_value += "9"; + assert_eq!(max_decimal.raw_value(), max_bytes); + assert_eq!(min_decimal.raw_value(), min_bytes); + } + } +} diff --git a/arrow/src/datatypes/field.rs b/arrow/src/datatypes/field.rs index abb80d64aaf3..f50ebadd5e7c 100644 --- a/arrow/src/datatypes/field.rs +++ b/arrow/src/datatypes/field.rs @@ -209,23 +209,17 @@ impl Field { } fn _fields<'a>(&'a self, dt: &'a DataType) -> Vec<&Field> { - let mut collected_fields = vec![]; - match dt { DataType::Struct(fields) | DataType::Union(fields, _, _) => { - collected_fields.extend(fields.iter().flat_map(|f| f.fields())) + fields.iter().flat_map(|f| f.fields()).collect() } DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) - | DataType::Map(field, _) => collected_fields.extend(field.fields()), - DataType::Dictionary(_, value_field) => { - collected_fields.append(&mut self._fields(value_field.as_ref())) - } - _ => (), + | DataType::Map(field, _) => field.fields(), + DataType::Dictionary(_, value_field) => self._fields(value_field.as_ref()), + _ => vec![], } - - collected_fields } /// Returns a vector containing all (potentially nested) `Field` instances selected by the @@ -506,12 +500,10 @@ impl Field { pub fn to_json(&self) -> Value { let children: Vec = match self.data_type() { DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), - DataType::List(field) => vec![field.to_json()], - DataType::LargeList(field) => vec![field.to_json()], - DataType::FixedSizeList(field, _) => vec![field.to_json()], - DataType::Map(field, _) => { - vec![field.to_json()] - } + DataType::List(field) + | DataType::LargeList(field) + | DataType::FixedSizeList(field, _) + | DataType::Map(field, _) => vec![field.to_json()], _ => vec![], }; match self.data_type() { @@ -550,6 +542,17 @@ impl Field { /// assert!(field.is_nullable()); /// ``` pub fn try_merge(&mut self, from: &Field) -> Result<()> { + if from.dict_id != self.dict_id { + return Err(ArrowError::SchemaError( + "Fail to merge schema Field due to conflicting dict_id".to_string(), + )); + } + if from.dict_is_ordered != self.dict_is_ordered { + return Err(ArrowError::SchemaError( + "Fail to merge schema Field due to conflicting dict_is_ordered" + .to_string(), + )); + } // merge metadata match (self.metadata(), from.metadata()) { (Some(self_metadata), Some(from_metadata)) => { @@ -572,31 +575,16 @@ impl Field { } _ => {} } - if from.dict_id != self.dict_id { - return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting dict_id".to_string(), - )); - } - if from.dict_is_ordered != self.dict_is_ordered { - return Err(ArrowError::SchemaError( 
- "Fail to merge schema Field due to conflicting dict_is_ordered" - .to_string(), - )); - } match &mut self.data_type { DataType::Struct(nested_fields) => match &from.data_type { DataType::Struct(from_nested_fields) => { for from_field in from_nested_fields { - let mut is_new_field = true; - for self_field in nested_fields.iter_mut() { - if self_field.name != from_field.name { - continue; - } - is_new_field = false; - self_field.try_merge(from_field)?; - } - if is_new_field { - nested_fields.push(from_field.clone()); + match nested_fields + .iter_mut() + .find(|self_field| self_field.name == from_field.name) + { + Some(self_field) => self_field.try_merge(from_field)?, + None => nested_fields.push(from_field.clone()), } } } @@ -685,9 +673,7 @@ impl Field { } } } - if from.nullable { - self.nullable = from.nullable; - } + self.nullable |= from.nullable; Ok(()) } @@ -698,41 +684,25 @@ impl Field { /// * self.metadata is a superset of other.metadata /// * all other fields are equal pub fn contains(&self, other: &Field) -> bool { - if self.name != other.name - || self.data_type != other.data_type - || self.dict_id != other.dict_id - || self.dict_is_ordered != other.dict_is_ordered - { - return false; - } - - if self.nullable != other.nullable && !self.nullable { - return false; - } - + self.name == other.name + && self.data_type == other.data_type + && self.dict_id == other.dict_id + && self.dict_is_ordered == other.dict_is_ordered + // self need to be nullable or both of them are not nullable + && (self.nullable || !other.nullable) // make sure self.metadata is a superset of other.metadata - match (&self.metadata, &other.metadata) { - (None, Some(_)) => { - return false; - } + && match (&self.metadata, &other.metadata) { + (_, None) => true, + (None, Some(_)) => false, (Some(self_meta), Some(other_meta)) => { - for (k, v) in other_meta.iter() { + other_meta.iter().all(|(k, v)| { match self_meta.get(k) { - Some(s) => { - if s != v { - return false; - } - } - None => { - return false; - } + Some(s) => s == v, + None => false } - } + }) } - _ => {} } - - true } } @@ -745,7 +715,7 @@ impl std::fmt::Display for Field { #[cfg(test)] mod test { - use super::{DataType, Field}; + use super::*; use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; @@ -840,4 +810,72 @@ mod test { assert_ne!(dict1, dict2); assert_ne!(get_field_hash(&dict1), get_field_hash(&dict2)); } + + #[test] + fn test_contains_reflexivity() { + let mut field = Field::new("field1", DataType::Float16, false); + field.set_metadata(Some(BTreeMap::from([ + (String::from("k0"), String::from("v0")), + (String::from("k1"), String::from("v1")), + ]))); + assert!(field.contains(&field)) + } + + #[test] + fn test_contains_transitivity() { + let child_field = Field::new("child1", DataType::Float16, false); + + let mut field1 = Field::new("field1", DataType::Struct(vec![child_field]), false); + field1.set_metadata(Some(BTreeMap::from([( + String::from("k1"), + String::from("v1"), + )]))); + + let mut field2 = Field::new("field1", DataType::Struct(vec![]), true); + field2.set_metadata(Some(BTreeMap::from([( + String::from("k2"), + String::from("v2"), + )]))); + field2.try_merge(&field1).unwrap(); + + let mut field3 = Field::new("field1", DataType::Struct(vec![]), false); + field3.set_metadata(Some(BTreeMap::from([( + String::from("k3"), + String::from("v3"), + )]))); + field3.try_merge(&field2).unwrap(); + + assert!(field2.contains(&field1)); + assert!(field3.contains(&field2)); + assert!(field3.contains(&field1)); + + 
assert!(!field1.contains(&field2)); + assert!(!field1.contains(&field3)); + assert!(!field2.contains(&field3)); + } + + #[test] + fn test_contains_nullable() { + let field1 = Field::new("field1", DataType::Boolean, true); + let field2 = Field::new("field1", DataType::Boolean, false); + assert!(field1.contains(&field2)); + assert!(!field2.contains(&field1)); + } + + #[test] + fn test_contains_must_have_same_fields() { + let child_field1 = Field::new("child1", DataType::Float16, false); + let child_field2 = Field::new("child2", DataType::Float16, false); + + let field1 = + Field::new("field1", DataType::Struct(vec![child_field1.clone()]), true); + let field2 = Field::new( + "field1", + DataType::Struct(vec![child_field1, child_field2]), + true, + ); + + assert!(!field1.contains(&field2)); + assert!(!field2.contains(&field1)); + } } diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index b035a37c8433..1f98a4afa918 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -37,8 +37,10 @@ pub use types::*; mod datatype; pub use datatype::*; mod delta; -mod ffi; +#[cfg(feature = "ffi")] +mod ffi; +#[cfg(feature = "ffi")] pub use ffi::*; /// A reference-counted reference to a [`Schema`](crate::datatypes::Schema). diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 4073e7183175..528f3adc2d84 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -29,14 +29,16 @@ //! # use arrow::array::{Int32Array, Array, ArrayData, export_array_into_raw, make_array, make_array_from_raw}; //! # use arrow::error::{Result, ArrowError}; //! # use arrow::compute::kernels::arithmetic; -//! # use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; +//! # use arrow::ffi::{ArrowArray, FFI_ArrowArray, FFI_ArrowSchema}; //! # use std::convert::TryFrom; //! # fn main() -> Result<()> { //! // create an array natively //! let array = Int32Array::from(vec![Some(1), None, Some(3)]); //! //! // export it -//! let (array_ptr, schema_ptr) = array.to_raw()?; +//! +//! let ffi_array = ArrowArray::try_new(array.data().clone())?; +//! let (array_ptr, schema_ptr) = ArrowArray::into_raw(ffi_array); //! //! // consumed and used by something else... //! @@ -456,7 +458,7 @@ struct ArrayPrivateData { impl FFI_ArrowArray { /// creates a new `FFI_ArrowArray` from existing data. - /// # Safety + /// # Memory Leaks /// This method releases `buffers`. Consumers of this struct *must* call `release` before /// releasing this struct, or contents in `buffers` leak. pub fn new(data: &ArrayData) -> Self { @@ -836,10 +838,11 @@ impl<'a> ArrowArrayRef for ArrowArrayChild<'a> { impl ArrowArray { /// creates a new `ArrowArray`. This is used to export to the C Data Interface. - /// # Safety - /// See safety of [ArrowArray] - #[allow(clippy::too_many_arguments)] - pub unsafe fn try_new(data: ArrayData) -> Result { + /// + /// # Memory Leaks + /// This method releases `buffers`. Consumers of this struct *must* call `release` before + /// releasing this struct, or contents in `buffers` leak. 
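+    ///
+    /// # Example
+    ///
+    /// A minimal usage sketch, mirroring the module-level docs above; it assumes the
+    /// `ffi` feature is enabled and that the consumer of the exported pointers
+    /// eventually calls `release`:
+    ///
+    /// ```ignore
+    /// # use arrow::array::{Array, Int32Array};
+    /// # use arrow::ffi::ArrowArray;
+    /// // build an array natively, then export it over the C Data Interface
+    /// let array = Int32Array::from(vec![Some(1), None, Some(3)]);
+    /// let ffi_array = ArrowArray::try_new(array.data().clone())?;
+    /// let (array_ptr, schema_ptr) = ArrowArray::into_raw(ffi_array);
+    /// ```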
+ pub fn try_new(data: ArrayData) -> Result { let array = Arc::new(FFI_ArrowArray::new(&data)); let schema = Arc::new(FFI_ArrowSchema::try_from(data.data_type())?); Ok(ArrowArray { array, schema }) @@ -908,7 +911,6 @@ impl<'a> ArrowArrayChild<'a> { #[cfg(test)] mod tests { use super::*; - use crate::array::BasicDecimalArray; use crate::array::{ export_array_into_raw, make_array, Array, ArrayData, BooleanArray, Decimal128Array, DictionaryArray, DurationSecondArray, FixedSizeBinaryArray, @@ -1031,12 +1033,9 @@ mod tests { .collect::(); // Construct a list array from the above two - let list_data_type = match std::mem::size_of::() { - 4 => DataType::List(Box::new(Field::new("item", DataType::Int32, false))), - _ => { - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))) - } - }; + let list_data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + Field::new("item", DataType::Int32, false), + )); let list_data = ArrayData::builder(list_data_type) .len(3) diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index f3af214cee0a..ce44d74a1a1b 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -1215,7 +1215,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } @@ -1336,7 +1336,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); // the next batch must be empty assert!(reader.next().is_none()); // the stream must indicate that it's finished @@ -1373,7 +1373,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } @@ -1406,7 +1406,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); // the next batch must be empty assert!(reader.next().is_none()); // the stream must indicate that it's finished diff --git a/arrow/src/ipc/writer.rs b/arrow/src/ipc/writer.rs index 374f9fad1efa..f0942b074cfe 100644 --- a/arrow/src/ipc/writer.rs +++ b/arrow/src/ipc/writer.rs @@ -1319,7 +1319,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } @@ -1370,7 +1370,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } @@ -1434,7 +1434,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } @@ -1495,7 +1495,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } diff --git a/arrow/src/json/reader.rs b/arrow/src/json/reader.rs index 9b348e629169..66fdc691887b 100644 --- a/arrow/src/json/reader.rs +++ b/arrow/src/json/reader.rs @@ -590,7 +590,7 @@ pub struct Decoder { 
options: DecoderOptions, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq)] /// Options for JSON decoding pub struct DecoderOptions { /// Batch size (number of records to load each time), defaults to 1024 records diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 45092ef4c54e..04f495dc0819 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -18,41 +18,8 @@ //! A complete, safe, native Rust implementation of [Apache Arrow](https://arrow.apache.org), a cross-language //! development platform for in-memory data. //! -//! # Performance Tips -//! -//! Arrow aims to be as fast as possible out of the box, whilst not compromising on safety. However, -//! it relies heavily on LLVM auto-vectorisation to achieve this. Unfortunately the LLVM defaults, -//! particularly for x86_64, favour portability over performance, and LLVM will consequently avoid -//! using more recent instructions that would result in errors on older CPUs. -//! -//! To address this it is recommended that you specify the override the LLVM defaults either -//! by setting the `RUSTFLAGS` environment variable, or by setting `rustflags` in your -//! [Cargo configuration](https://doc.rust-lang.org/cargo/reference/config.html) -//! -//! Enable all features supported by the current CPU -//! -//! ```ignore -//! RUSTFLAGS="-C target-cpu=native" -//! ``` -//! -//! Enable all features supported by the current CPU, and enable full use of AVX512 -//! -//! ```ignore -//! RUSTFLAGS="-C target-cpu=native -C target-feature=-prefer-256-bit" -//! ``` -//! -//! Enable all features supported by CPUs more recent than haswell (2013) -//! -//! ```ignore -//! RUSTFLAGS="-C target-cpu=haswell" -//! ``` -//! -//! For a full list of features and target CPUs use -//! -//! ```ignore -//! $ rustc --print target-cpus -//! $ rustc --print target-features -//! ``` +//! Please see the [arrow crates.io](https://crates.io/crates/arrow) +//! page for feature flags and tips to improve performance. //! //! # Columnar Format //! 
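
The `temporal_conversions.rs` hunks further down rework `date64_to_datetime` and the `timestamp_*_to_datetime` helpers around a new `split_second` helper, so that negative inputs floor the seconds and keep a non-negative sub-second remainder. A standalone sketch of that normalization, written as an equivalent `div_euclid`/`rem_euclid` formulation rather than the exact implementation (the function and constant names here are illustrative):

```rust
/// Split `v`, expressed in units of `1/base` seconds, into whole seconds and a
/// non-negative remainder, flooring the seconds for negative inputs.
fn split(v: i64, base: i64) -> (i64, u32) {
    (v.div_euclid(base), v.rem_euclid(base) as u32)
}

fn main() {
    const NANOSECONDS: i64 = 1_000_000_000;
    // Mirrors the `test_split_seconds` cases added in this diff.
    assert_eq!(split(100, NANOSECONDS), (0, 100));
    assert_eq!(split(-1, NANOSECONDS), (-1, 999_999_999));
    assert_eq!(split(-123_000_000_001, NANOSECONDS), (-124, 999_999_999));
}
```
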
@@ -291,7 +258,9 @@ pub mod compute; pub mod csv; pub mod datatypes; pub mod error; +#[cfg(feature = "ffi")] pub mod ffi; +#[cfg(feature = "ffi")] pub mod ffi_stream; #[cfg(feature = "ipc")] pub mod ipc; diff --git a/arrow/src/temporal_conversions.rs b/arrow/src/temporal_conversions.rs index fda004a6daa4..12982b7dabc2 100644 --- a/arrow/src/temporal_conversions.rs +++ b/arrow/src/temporal_conversions.rs @@ -42,11 +42,13 @@ pub fn date32_to_datetime(v: i32) -> NaiveDateTime { /// converts a `i64` representing a `date64` to [`NaiveDateTime`] #[inline] pub fn date64_to_datetime(v: i64) -> NaiveDateTime { + let (sec, milli_sec) = split_second(v, MILLISECONDS); + NaiveDateTime::from_timestamp( // extract seconds from milliseconds - v / MILLISECONDS, + sec, // discard extracted seconds and convert milliseconds to nanoseconds - (v % MILLISECONDS * MICROSECONDS) as u32, + milli_sec * MICROSECONDS as u32, ) } @@ -101,36 +103,59 @@ pub fn timestamp_s_to_datetime(v: i64) -> NaiveDateTime { /// converts a `i64` representing a `timestamp(ms)` to [`NaiveDateTime`] #[inline] pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { + let (sec, milli_sec) = split_second(v, MILLISECONDS); + NaiveDateTime::from_timestamp( // extract seconds from milliseconds - v / MILLISECONDS, + sec, // discard extracted seconds and convert milliseconds to nanoseconds - (v % MILLISECONDS * MICROSECONDS) as u32, + milli_sec * MICROSECONDS as u32, ) } /// converts a `i64` representing a `timestamp(us)` to [`NaiveDateTime`] #[inline] pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { + let (sec, micro_sec) = split_second(v, MICROSECONDS); + NaiveDateTime::from_timestamp( // extract seconds from microseconds - v / MICROSECONDS, + sec, // discard extracted seconds and convert microseconds to nanoseconds - (v % MICROSECONDS * MILLISECONDS) as u32, + micro_sec * MILLISECONDS as u32, ) } /// converts a `i64` representing a `timestamp(ns)` to [`NaiveDateTime`] #[inline] pub fn timestamp_ns_to_datetime(v: i64) -> NaiveDateTime { + let (sec, nano_sec) = split_second(v, NANOSECONDS); + NaiveDateTime::from_timestamp( // extract seconds from nanoseconds - v / NANOSECONDS, - // discard extracted seconds - (v % NANOSECONDS) as u32, + sec, // discard extracted seconds + nano_sec, ) } +/// +#[inline] +pub(crate) fn split_second(v: i64, base: i64) -> (i64, u32) { + if v < 0 { + let v = -v; + let mut seconds = v / base; + let mut part = v % base; + + if part > 0 { + seconds += 1; + part = base - part; + } + (-seconds, part as u32) + } else { + (v / base, (v % base) as u32) + } +} + /// converts a `i64` representing a `duration(s)` to [`Duration`] #[inline] pub fn duration_s_to_duration(v: i64) -> Duration { @@ -154,3 +179,83 @@ pub fn duration_us_to_duration(v: i64) -> Duration { pub fn duration_ns_to_duration(v: i64) -> Duration { Duration::nanoseconds(v) } + +#[cfg(test)] +mod tests { + use crate::temporal_conversions::{ + date64_to_datetime, split_second, timestamp_ms_to_datetime, + timestamp_ns_to_datetime, timestamp_us_to_datetime, NANOSECONDS, + }; + use chrono::NaiveDateTime; + + #[test] + fn negative_input_timestamp_ns_to_datetime() { + assert_eq!( + timestamp_ns_to_datetime(-1), + NaiveDateTime::from_timestamp(-1, 999_999_999) + ); + + assert_eq!( + timestamp_ns_to_datetime(-1_000_000_001), + NaiveDateTime::from_timestamp(-2, 999_999_999) + ); + } + + #[test] + fn negative_input_timestamp_us_to_datetime() { + assert_eq!( + timestamp_us_to_datetime(-1), + NaiveDateTime::from_timestamp(-1, 999_999_000) + ); + + assert_eq!( + 
timestamp_us_to_datetime(-1_000_001), + NaiveDateTime::from_timestamp(-2, 999_999_000) + ); + } + + #[test] + fn negative_input_timestamp_ms_to_datetime() { + assert_eq!( + timestamp_ms_to_datetime(-1), + NaiveDateTime::from_timestamp(-1, 999_000_000) + ); + + assert_eq!( + timestamp_ms_to_datetime(-1_001), + NaiveDateTime::from_timestamp(-2, 999_000_000) + ); + } + + #[test] + fn negative_input_date64_to_datetime() { + assert_eq!( + date64_to_datetime(-1), + NaiveDateTime::from_timestamp(-1, 999_000_000) + ); + + assert_eq!( + date64_to_datetime(-1_001), + NaiveDateTime::from_timestamp(-2, 999_000_000) + ); + } + + #[test] + fn test_split_seconds() { + let (sec, nano_sec) = split_second(100, NANOSECONDS); + assert_eq!(sec, 0); + assert_eq!(nano_sec, 100); + + let (sec, nano_sec) = split_second(123_000_000_456, NANOSECONDS); + assert_eq!(sec, 123); + assert_eq!(nano_sec, 456); + + let (sec, nano_sec) = split_second(-1, NANOSECONDS); + assert_eq!(sec, -1); + assert_eq!(nano_sec, 999_999_999); + + let (sec, nano_sec) = split_second(-123_000_000_001, NANOSECONDS); + assert_eq!(sec, -124); + assert_eq!(nano_sec, 999_999_999); + } +} diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 21b8ee8c9fd1..4d974409a0ee 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -143,6 +143,17 @@ pub fn create_random_array( }) .collect::>>()?, )?), + d @ Dictionary(_, value_type) + if crate::compute::can_cast_types(value_type, d) => + { + let f = Field::new( + field.name(), + value_type.as_ref().clone(), + field.is_nullable(), + ); + let v = create_random_array(&f, size, null_density, true_density)?; + crate::compute::cast(&v, d)? + } other => { return Err(ArrowError::NotYetImplemented(format!( "Generating random arrays not yet implemented for {:?}", diff --git a/arrow/src/util/decimal.rs b/arrow/src/util/decimal.rs index 62a950795378..96399b870207 100644 --- a/arrow/src/util/decimal.rs +++ b/arrow/src/util/decimal.rs @@ -18,21 +18,50 @@ //! Decimal related utils use crate::datatypes::{ - DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, - DECIMAL256_MAX_SCALE, + DataType, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, + DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, }; use crate::error::{ArrowError, Result}; use num::bigint::BigInt; use num::Signed; use std::cmp::{min, Ordering}; -pub trait BasicDecimal: PartialOrd + Ord + PartialEq + Eq { - /// The bit-width of the internal representation. - const BIT_WIDTH: usize; - /// The maximum precision. - const MAX_PRECISION: usize; - /// The maximum scale. 
- const MAX_SCALE: usize; +#[derive(Debug)] +pub struct BasicDecimal { + precision: usize, + scale: usize, + value: [u8; BYTE_WIDTH], +} + +impl BasicDecimal { + #[allow(clippy::type_complexity)] + const MAX_PRECISION_SCALE_CONSTRUCTOR_DEFAULT_TYPE: ( + usize, + usize, + fn(usize, usize) -> DataType, + DataType, + ) = match BYTE_WIDTH { + 16 => ( + DECIMAL128_MAX_PRECISION, + DECIMAL128_MAX_SCALE, + DataType::Decimal128, + DataType::Decimal128(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE), + ), + 32 => ( + DECIMAL256_MAX_PRECISION, + DECIMAL256_MAX_SCALE, + DataType::Decimal256, + DataType::Decimal256(DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE), + ), + _ => panic!("invalid byte width"), + }; + + pub const MAX_PRECISION: usize = Self::MAX_PRECISION_SCALE_CONSTRUCTOR_DEFAULT_TYPE.0; + pub const MAX_SCALE: usize = Self::MAX_PRECISION_SCALE_CONSTRUCTOR_DEFAULT_TYPE.1; + pub const TYPE_CONSTRUCTOR: fn(usize, usize) -> DataType = + Self::MAX_PRECISION_SCALE_CONSTRUCTOR_DEFAULT_TYPE.2; + pub const DEFAULT_TYPE: DataType = + Self::MAX_PRECISION_SCALE_CONSTRUCTOR_DEFAULT_TYPE.3; /// Tries to create a decimal value from precision, scale and bytes. /// If the length of bytes isn't same as the bit width of this decimal, @@ -41,7 +70,11 @@ pub trait BasicDecimal: PartialOrd + Ord + PartialEq + Eq { /// Safety: /// This method doesn't validate if the decimal value represented by the bytes /// can be fitted into the specified precision. - fn try_new_from_bytes(precision: usize, scale: usize, bytes: &[u8]) -> Result + pub fn try_new_from_bytes( + precision: usize, + scale: usize, + bytes: &[u8; BYTE_WIDTH], + ) -> Result where Self: Sized, { @@ -67,13 +100,13 @@ pub trait BasicDecimal: PartialOrd + Ord + PartialEq + Eq { ))); } - if bytes.len() == Self::BIT_WIDTH / 8 { + if bytes.len() == BYTE_WIDTH { Ok(Self::new(precision, scale, bytes)) } else { Err(ArrowError::InvalidArgumentError(format!( "Input to Decimal{} must be {} bytes", - Self::BIT_WIDTH, - Self::BIT_WIDTH / 8 + BYTE_WIDTH * 8, + BYTE_WIDTH ))) } } @@ -83,21 +116,33 @@ pub trait BasicDecimal: PartialOrd + Ord + PartialEq + Eq { /// Safety: /// This method doesn't check if the length of bytes is compatible with this decimal. /// Use `try_new_from_bytes` for safe constructor. - fn new(precision: usize, scale: usize, bytes: &[u8]) -> Self; - + pub fn new(precision: usize, scale: usize, bytes: &[u8]) -> Self { + Self { + precision, + scale, + value: bytes.try_into().unwrap(), + } + } /// Returns the raw bytes of the integer representation of the decimal. - fn raw_value(&self) -> &[u8]; + pub fn raw_value(&self) -> &[u8] { + &self.value + } /// Returns the precision of the decimal. - fn precision(&self) -> usize; + pub fn precision(&self) -> usize { + self.precision + } /// Returns the scale of the decimal. - fn scale(&self) -> usize; + pub fn scale(&self) -> usize { + self.scale + } /// Returns the string representation of the decimal. /// If the string representation cannot be fitted with the precision of the decimal, /// the string will be truncated. 
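+    ///
+    /// An illustrative sketch, mirroring the `decimal_256_from_bytes` test in this
+    /// module: the 256-bit little-endian encoding of the integer `100`, with
+    /// precision 5 and scale 2, is rendered as `"1.00"`.
+    ///
+    /// ```ignore
+    /// let mut bytes = [0_u8; 32];
+    /// bytes[0..16].clone_from_slice(&100_i128.to_le_bytes());
+    /// let value = Decimal256::try_new_from_bytes(5, 2, &bytes).unwrap();
+    /// assert_eq!(value.to_string(), "1.00");
+    /// ```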
- fn to_string(&self) -> String { + #[allow(clippy::inherent_to_string)] + pub fn to_string(&self) -> String { let raw_bytes = self.raw_value(); let integer = BigInt::from_signed_bytes_le(raw_bytes); let value_str = integer.to_string(); @@ -119,15 +164,44 @@ pub trait BasicDecimal: PartialOrd + Ord + PartialEq + Eq { } } +impl PartialOrd for BasicDecimal { + fn partial_cmp(&self, other: &Self) -> Option { + assert_eq!( + self.scale, other.scale, + "Cannot compare two Decimals with different scale: {}, {}", + self.scale, other.scale + ); + Some(singed_cmp_le_bytes(&self.value, &other.value)) + } +} + +impl Ord for BasicDecimal { + fn cmp(&self, other: &Self) -> Ordering { + assert_eq!( + self.scale, other.scale, + "Cannot compare two Decimals with different scale: {}, {}", + self.scale, other.scale + ); + singed_cmp_le_bytes(&self.value, &other.value) + } +} + +impl PartialEq for BasicDecimal { + fn eq(&self, other: &Self) -> bool { + assert_eq!( + self.scale, other.scale, + "Cannot compare two Decimals with different scale: {}, {}", + self.scale, other.scale + ); + self.value.eq(&other.value) + } +} + +impl Eq for BasicDecimal {} + /// Represents a decimal value with precision and scale. /// The decimal value could represented by a signed 128-bit integer. -#[derive(Debug)] -pub struct Decimal128 { - #[allow(dead_code)] - precision: usize, - scale: usize, - value: [u8; 16], -} +pub type Decimal128 = BasicDecimal<16>; impl Decimal128 { /// Creates `Decimal128` from an `i128` value. @@ -154,13 +228,7 @@ impl From for i128 { /// Represents a decimal value with precision and scale. /// The decimal value could be represented by a signed 256-bit integer. -#[derive(Debug)] -pub struct Decimal256 { - #[allow(dead_code)] - precision: usize, - scale: usize, - value: [u8; 32], -} +pub type Decimal256 = BasicDecimal<32>; impl Decimal256 { /// Constructs a `Decimal256` value from a `BigInt`. @@ -170,84 +238,25 @@ impl Decimal256 { scale: usize, ) -> Result { let mut bytes = if num.is_negative() { - vec![255; 32] + [255_u8; 32] } else { - vec![0; 32] + [0; 32] }; let num_bytes = &num.to_signed_bytes_le(); bytes[0..num_bytes.len()].clone_from_slice(num_bytes); Decimal256::try_new_from_bytes(precision, scale, &bytes) } -} - -macro_rules! 
def_decimal { - ($ty:ident, $bit:expr, $max_p:expr, $max_s:expr) => { - impl BasicDecimal for $ty { - const BIT_WIDTH: usize = $bit; - const MAX_PRECISION: usize = $max_p; - const MAX_SCALE: usize = $max_s; - - fn new(precision: usize, scale: usize, bytes: &[u8]) -> Self { - $ty { - precision, - scale, - value: bytes.try_into().unwrap(), - } - } - - fn raw_value(&self) -> &[u8] { - &self.value - } - - fn precision(&self) -> usize { - self.precision - } - fn scale(&self) -> usize { - self.scale - } - } - - impl PartialOrd for $ty { - fn partial_cmp(&self, other: &Self) -> Option { - assert_eq!( - self.scale, other.scale, - "Cannot compare two Decimals with different scale: {}, {}", - self.scale, other.scale - ); - Some(singed_cmp_le_bytes(&self.value, &other.value)) - } - } - - impl Ord for $ty { - fn cmp(&self, other: &Self) -> Ordering { - assert_eq!( - self.scale, other.scale, - "Cannot compare two Decimals with different scale: {}, {}", - self.scale, other.scale - ); - singed_cmp_le_bytes(&self.value, &other.value) - } - } - - impl PartialEq for $ty { - fn eq(&self, other: &Self) -> bool { - assert_eq!( - self.scale, other.scale, - "Cannot compare two Decimals with different scale: {}, {}", - self.scale, other.scale - ); - self.value.eq(&other.value) - } - } - - impl Eq for $ty {} - }; + /// Constructs a `BigInt` from this `Decimal256` value. + pub(crate) fn to_big_int(&self) -> BigInt { + BigInt::from_signed_bytes_le(&self.value) + } } // compare two signed integer which are encoded with little endian. // left bytes and right bytes must have the same length. -fn singed_cmp_le_bytes(left: &[u8], right: &[u8]) -> Ordering { +#[inline] +pub(crate) fn singed_cmp_le_bytes(left: &[u8], right: &[u8]) -> Ordering { assert_eq!( left.len(), right.len(), @@ -286,24 +295,9 @@ fn singed_cmp_le_bytes(left: &[u8], right: &[u8]) -> Ordering { Ordering::Equal } -def_decimal!( - Decimal128, - 128, - DECIMAL128_MAX_PRECISION, - DECIMAL128_MAX_SCALE -); -def_decimal!( - Decimal256, - 256, - DECIMAL256_MAX_PRECISION, - DECIMAL256_MAX_SCALE -); - #[cfg(test)] mod tests { - use crate::util::decimal::{ - singed_cmp_le_bytes, BasicDecimal, Decimal128, Decimal256, - }; + use super::*; use num::{BigInt, Num}; use rand::random; @@ -356,9 +350,9 @@ mod tests { #[test] fn decimal_256_from_bytes() { - let mut bytes = vec![0; 32]; + let mut bytes = [0_u8; 32]; bytes[0..16].clone_from_slice(&100_i128.to_le_bytes()); - let value = Decimal256::try_new_from_bytes(5, 2, bytes.as_slice()).unwrap(); + let value = Decimal256::try_new_from_bytes(5, 2, &bytes).unwrap(); assert_eq!(value.to_string(), "1.00"); bytes[0..16].clone_from_slice(&i128::MAX.to_le_bytes()); @@ -378,7 +372,7 @@ mod tests { ); // smaller than i128 minimum - bytes = vec![255; 32]; + bytes = [255; 32]; bytes[31] = 128; let value = Decimal256::try_new_from_bytes(76, 4, &bytes).unwrap(); assert_eq!( @@ -386,7 +380,7 @@ mod tests { "-574437317700748313234121683441537667865831564552201235664496608164256541.5731" ); - bytes = vec![255; 32]; + bytes = [255; 32]; let value = Decimal256::try_new_from_bytes(5, 2, &bytes).unwrap(); assert_eq!(value.to_string(), "-0.01"); } diff --git a/arrow/src/util/display.rs b/arrow/src/util/display.rs index 26bc8a1923a6..aa4fd4200870 100644 --- a/arrow/src/util/display.rs +++ b/arrow/src/util/display.rs @@ -23,7 +23,6 @@ use std::fmt::Write; use std::sync::Arc; use crate::array::Array; -use crate::array::BasicDecimalArray; use crate::datatypes::{ ArrowNativeType, ArrowPrimitiveType, DataType, Field, Int16Type, Int32Type, 
Int64Type, Int8Type, TimeUnit, UInt16Type, UInt32Type, UInt64Type, UInt8Type, diff --git a/arrow/src/util/integration_util.rs b/arrow/src/util/integration_util.rs index 0077b2fb72aa..ee5c947a2fff 100644 --- a/arrow/src/util/integration_util.rs +++ b/arrow/src/util/integration_util.rs @@ -19,13 +19,22 @@ //! //! These utilities define structs that read the integration JSON format for integration testing purposes. +use hex::decode; +use num::BigInt; +use num::Signed; use serde_derive::{Deserialize, Serialize}; -use serde_json::{Map as SJMap, Number as VNumber, Value}; +use serde_json::{Map as SJMap, Value}; +use std::collections::HashMap; +use std::sync::Arc; use crate::array::*; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::compute; use crate::datatypes::*; -use crate::error::Result; +use crate::error::{ArrowError, Result}; use crate::record_batch::{RecordBatch, RecordBatchReader}; +use crate::util::bit_util; +use crate::util::decimal::Decimal256; /// A struct that represents an Arrow file with a schema and record batches #[derive(Deserialize, Serialize, Debug)] @@ -42,6 +51,8 @@ pub struct ArrowJson { #[derive(Deserialize, Serialize, Debug)] pub struct ArrowJsonSchema { pub fields: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option>>, } /// Fields are left as JSON `Value` as they vary by `DataType` @@ -107,14 +118,14 @@ pub struct DictionaryIndexType { } /// A struct that partially reads the Arrow JSON record batch -#[derive(Deserialize, Serialize, Debug)] +#[derive(Deserialize, Serialize, Debug, Clone)] pub struct ArrowJsonBatch { count: usize, pub columns: Vec, } /// A struct that partially reads the Arrow JSON dictionary batch -#[derive(Deserialize, Serialize, Debug)] +#[derive(Deserialize, Serialize, Debug, Clone)] #[allow(non_snake_case)] pub struct ArrowJsonDictionaryBatch { pub id: i64, @@ -139,17 +150,45 @@ pub struct ArrowJsonColumn { impl ArrowJson { /// Compare the Arrow JSON with a record batch reader - pub fn equals_reader(&self, reader: &mut dyn RecordBatchReader) -> bool { + pub fn equals_reader(&self, reader: &mut dyn RecordBatchReader) -> Result { if !self.schema.equals_schema(&reader.schema()) { - return false; + return Ok(false); } - self.batches.iter().all(|col| { + + for json_batch in self.get_record_batches()?.into_iter() { let batch = reader.next(); match batch { - Some(Ok(batch)) => col.equals_batch(&batch), - _ => false, + Some(Ok(batch)) => { + if json_batch != batch { + println!("json: {:?}", json_batch); + println!("batch: {:?}", batch); + return Ok(false); + } + } + _ => return Ok(false), } - }) + } + + Ok(true) + } + + pub fn get_record_batches(&self) -> Result> { + let schema = self.schema.to_arrow_schema()?; + + let mut dictionaries = HashMap::new(); + self.dictionaries.iter().for_each(|dict_batches| { + dict_batches.iter().for_each(|d| { + dictionaries.insert(d.id, d.clone()); + }); + }); + + let batches: Result> = self + .batches + .iter() + .map(|col| record_batch_from_json(&schema, col.clone(), Some(&dictionaries))) + .collect(); + + batches } } @@ -169,6 +208,28 @@ impl ArrowJsonSchema { } true } + + fn to_arrow_schema(&self) -> Result { + let arrow_fields: Result> = self + .fields + .iter() + .map(|field| field.to_arrow_field()) + .collect(); + + if let Some(metadatas) = &self.metadata { + let mut metadata: HashMap = HashMap::new(); + + metadatas.iter().for_each(|pair| { + let key = pair.get("key").unwrap(); + let value = pair.get("value").unwrap(); + metadata.insert(key.clone(), value.clone()); + }); + + 
Ok(Schema::new_with_metadata(arrow_fields?, metadata)) + } else { + Ok(Schema::new(arrow_fields?)) + } + } } impl ArrowJsonField { @@ -199,251 +260,731 @@ impl ArrowJsonField { } } -impl ArrowJsonBatch { - /// Compare the Arrow JSON record batch with a `RecordBatch` - fn equals_batch(&self, batch: &RecordBatch) -> bool { - if self.count != batch.num_rows() { - return false; +pub fn record_batch_from_json( + schema: &Schema, + json_batch: ArrowJsonBatch, + json_dictionaries: Option<&HashMap>, +) -> Result { + let mut columns = vec![]; + + for (field, json_col) in schema.fields().iter().zip(json_batch.columns) { + let col = array_from_json(field, json_col, json_dictionaries)?; + columns.push(col); + } + + RecordBatch::try_new(Arc::new(schema.clone()), columns) +} + +/// Construct an Arrow array from a partially typed JSON column +pub fn array_from_json( + field: &Field, + json_col: ArrowJsonColumn, + dictionaries: Option<&HashMap>, +) -> Result { + match field.data_type() { + DataType::Null => Ok(Arc::new(NullArray::new(json_col.count))), + DataType::Boolean => { + let mut b = BooleanBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_bool().unwrap()), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) } - let num_columns = self.columns.len(); - if num_columns != batch.num_columns() { - return false; + DataType::Int8 => { + let mut b = Int8Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_i64().ok_or_else(|| { + ArrowError::JsonError(format!( + "Unable to get {:?} as int64", + value + )) + })? 
as i8), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) } - let schema = batch.schema(); - self.columns - .iter() - .zip(batch.columns()) - .zip(schema.fields()) - .all(|((col, arr), field)| { - // compare each column based on its type - if &col.name != field.name() { - return false; - } - let json_array: Vec = json_from_col(col, field.data_type()); - match field.data_type() { - DataType::Null => { - let arr: &NullArray = - arr.as_any().downcast_ref::().unwrap(); - // NullArrays should have the same length, json_array is empty - arr.len() == col.count - } - DataType::Boolean => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Int8 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Int16 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Int32 | DataType::Date32 | DataType::Time32(_) => { - let arr = Int32Array::from(arr.data().clone()); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - let arr = Int64Array::from(arr.data().clone()); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Interval(IntervalUnit::YearMonth) => { - let arr = IntervalYearMonthArray::from(arr.data().clone()); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Interval(IntervalUnit::DayTime) => { - let arr = IntervalDayTimeArray::from(arr.data().clone()); - let x = json_array - .iter() - .map(|v| { - match v { - Value::Null => Value::Null, - Value::Object(v) => { - // interval has days and milliseconds - let days: i32 = - v.get("days").unwrap().as_i64().unwrap() - as i32; - let milliseconds: i32 = v - .get("milliseconds") - .unwrap() - .as_i64() - .unwrap() - as i32; - let value: i64 = unsafe { - std::mem::transmute::<[i32; 2], i64>([ - days, - milliseconds, - ]) - }; - Value::Number(VNumber::from(value)) + DataType::Int16 => { + let mut b = Int16Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_i64().unwrap() as i16), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Int32 + | DataType::Date32 + | DataType::Time32(_) + | DataType::Interval(IntervalUnit::YearMonth) => { + let mut b = Int32Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_i64().unwrap() as i32), + _ => b.append_null(), + }; + } + let array = Arc::new(b.finish()) as ArrayRef; + compute::cast(&array, field.data_type()) + } + DataType::Int64 + | DataType::Date64 + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Duration(_) + | DataType::Interval(IntervalUnit::DayTime) => { + let mut b = Int64Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(match value { + Value::Number(n) => n.as_i64().unwrap(), + Value::String(s) => { + s.parse().expect("Unable to parse string as i64") + } + Value::Object(ref map) + if map.contains_key("days") + && map.contains_key("milliseconds") => + { + 
match field.data_type() { + DataType::Interval(IntervalUnit::DayTime) => { + let days = map.get("days").unwrap(); + let milliseconds = map.get("milliseconds").unwrap(); + + match (days, milliseconds) { + (Value::Number(d), Value::Number(m)) => { + let mut bytes = [0_u8; 8]; + let m = (m.as_i64().unwrap() as i32) + .to_le_bytes(); + let d = (d.as_i64().unwrap() as i32) + .to_le_bytes(); + + let c = [d, m].concat(); + bytes.copy_from_slice(c.as_slice()); + i64::from_le_bytes(bytes) + } + _ => panic!( + "Unable to parse {:?} as interval daytime", + value + ), } - // return null if Value is not an object - _ => Value::Null, } - }) - .collect::>(); - arr.equals_json(&x.iter().collect::>()[..]) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let arr = IntervalMonthDayNanoArray::from(arr.data().clone()); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::UInt8 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::UInt16 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::UInt32 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::UInt64 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Float32 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Float64 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Binary => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::LargeBinary => { - let arr = - arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::FixedSizeBinary(_) => { - let arr = - arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Utf8 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::LargeUtf8 => { - let arr = - arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::List(_) => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::LargeList(_) => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::FixedSizeList(_, _) => { - let arr = - arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Struct(_) => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Map(_, _) => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Decimal128(_, _) => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Dictionary(ref key_type, _) => match key_type.as_ref() { - DataType::Int8 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], - ) - } - DataType::Int16 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - 
arr.equals_json( - &json_array.iter().collect::>()[..], - ) - } - DataType::Int32 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], - ) - } - DataType::Int64 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], - ) + _ => panic!( + "Unable to parse {:?} as interval daytime", + value + ), + } } - DataType::UInt8 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], + _ => panic!("Unable to parse {:?} as number", value), + }), + _ => b.append_null(), + }; + } + let array = Arc::new(b.finish()) as ArrayRef; + compute::cast(&array, field.data_type()) + } + DataType::UInt8 => { + let mut b = UInt8Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_u64().unwrap() as u8), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::UInt16 => { + let mut b = UInt16Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_u64().unwrap() as u16), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::UInt32 => { + let mut b = UInt32Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_u64().unwrap() as u32), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::UInt64 => { + let mut b = UInt64Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + if value.is_string() { + b.append_value( + value + .as_str() + .unwrap() + .parse() + .expect("Unable to parse string as u64"), ) - } - DataType::UInt16 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], + } else if value.is_number() { + b.append_value( + value.as_u64().expect("Unable to read number as u64"), ) + } else { + panic!("Unable to parse value {:?} as u64", value) } - DataType::UInt32 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], - ) + } + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let mut b = IntervalMonthDayNanoBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(match value { + Value::Object(v) => { + let months = v.get("months").unwrap(); + let days = v.get("days").unwrap(); + let nanoseconds = v.get("nanoseconds").unwrap(); + match (months, days, nanoseconds) { + ( + Value::Number(months), + Value::Number(days), + Value::Number(nanoseconds), + ) => { + let months = months.as_i64().unwrap() as i32; + let days = days.as_i64().unwrap() as i32; + let nanoseconds = nanoseconds.as_i64().unwrap(); + let months_days_ns: i128 = ((nanoseconds as i128) + & 0xFFFFFFFFFFFFFFFF) + << 64 + | ((days as i128) & 0xFFFFFFFF) << 32 + | ((months as i128) & 0xFFFFFFFF); + months_days_ns + } + (_, _, _) 
=> { + panic!("Unable to parse {:?} as MonthDayNano", v) + } + } } - DataType::UInt64 => { - let arr = arr - .as_any() - .downcast_ref::() + _ => panic!("Unable to parse {:?} as MonthDayNano", value), + }), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Float32 => { + let mut b = Float32Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_f64().unwrap() as f32), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Float64 => { + let mut b = Float64Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_f64().unwrap()), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Binary => { + let mut b = BinaryBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + let v = decode(value.as_str().unwrap()).unwrap(); + b.append_value(&v) + } + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::LargeBinary => { + let mut b = LargeBinaryBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + let v = decode(value.as_str().unwrap()).unwrap(); + b.append_value(&v) + } + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Utf8 => { + let mut b = StringBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_str().unwrap()), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::LargeUtf8 => { + let mut b = LargeStringBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_str().unwrap()), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::FixedSizeBinary(len) => { + let mut b = FixedSizeBinaryBuilder::new(json_col.count, *len); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + let v = hex::decode(value.as_str().unwrap()).unwrap(); + b.append_value(&v)? 
+ } + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::List(child_field) => { + let null_buf = create_null_buf(&json_col); + let children = json_col.children.clone().unwrap(); + let child_array = array_from_json( + child_field, + children.get(0).unwrap().clone(), + dictionaries, + )?; + let offsets: Vec = json_col + .offset + .unwrap() + .iter() + .map(|v| v.as_i64().unwrap() as i32) + .collect(); + let list_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .offset(0) + .add_buffer(Buffer::from(&offsets.to_byte_slice())) + .add_child_data(child_array.into_data()) + .null_bit_buffer(Some(null_buf)) + .build() + .unwrap(); + Ok(Arc::new(ListArray::from(list_data))) + } + DataType::LargeList(child_field) => { + let null_buf = create_null_buf(&json_col); + let children = json_col.children.clone().unwrap(); + let child_array = array_from_json( + child_field, + children.get(0).unwrap().clone(), + dictionaries, + )?; + let offsets: Vec = json_col + .offset + .unwrap() + .iter() + .map(|v| match v { + Value::Number(n) => n.as_i64().unwrap(), + Value::String(s) => s.parse::().unwrap(), + _ => panic!("64-bit offset must be either string or number"), + }) + .collect(); + let list_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .offset(0) + .add_buffer(Buffer::from(&offsets.to_byte_slice())) + .add_child_data(child_array.into_data()) + .null_bit_buffer(Some(null_buf)) + .build() + .unwrap(); + Ok(Arc::new(LargeListArray::from(list_data))) + } + DataType::FixedSizeList(child_field, _) => { + let children = json_col.children.clone().unwrap(); + let child_array = array_from_json( + child_field, + children.get(0).unwrap().clone(), + dictionaries, + )?; + let null_buf = create_null_buf(&json_col); + let list_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .add_child_data(child_array.into_data()) + .null_bit_buffer(Some(null_buf)) + .build() + .unwrap(); + Ok(Arc::new(FixedSizeListArray::from(list_data))) + } + DataType::Struct(fields) => { + // construct struct with null data + let null_buf = create_null_buf(&json_col); + let mut array_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .null_bit_buffer(Some(null_buf)); + + for (field, col) in fields.iter().zip(json_col.children.unwrap()) { + let array = array_from_json(field, col, dictionaries)?; + array_data = array_data.add_child_data(array.into_data()); + } + + let array = StructArray::from(array_data.build().unwrap()); + Ok(Arc::new(array)) + } + DataType::Dictionary(key_type, value_type) => { + let dict_id = field.dict_id().ok_or_else(|| { + ArrowError::JsonError(format!( + "Unable to find dict_id for field {:?}", + field + )) + })?; + // find dictionary + let dictionary = dictionaries + .ok_or_else(|| { + ArrowError::JsonError(format!( + "Unable to find any dictionaries for field {:?}", + field + )) + })? 
+ .get(&dict_id); + match dictionary { + Some(dictionary) => dictionary_array_from_json( + field, + json_col, + key_type, + value_type, + dictionary, + dictionaries, + ), + None => Err(ArrowError::JsonError(format!( + "Unable to find dictionary for field {:?}", + field + ))), + } + } + DataType::Decimal128(precision, scale) => { + let mut b = Decimal128Builder::new(json_col.count, *precision, *scale); + // C++ interop tests involve incompatible decimal values + unsafe { + b.disable_value_validation(); + } + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + b.append_value(value.as_str().unwrap().parse::().unwrap())? + } + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Decimal256(precision, scale) => { + let mut b = Decimal256Builder::new(json_col.count, *precision, *scale); + // C++ interop tests involve incompatible decimal values + unsafe { + b.disable_value_validation(); + } + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + let str = value.as_str().unwrap(); + let integer = BigInt::parse_bytes(str.as_bytes(), 10).unwrap(); + let integer_bytes = integer.to_signed_bytes_le(); + let mut bytes = if integer.is_positive() { + [0_u8; 32] + } else { + [255_u8; 32] + }; + bytes[0..integer_bytes.len()] + .copy_from_slice(integer_bytes.as_slice()); + let decimal = + Decimal256::try_new_from_bytes(*precision, *scale, &bytes) .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], - ) - } - t => panic!("Unsupported dictionary comparison for {:?}", t), - }, - t => panic!("Unsupported comparison for {:?}", t), + b.append_value(&decimal)?; + } + _ => b.append_null(), } - }) + } + Ok(Arc::new(b.finish())) + } + DataType::Map(child_field, _) => { + let null_buf = create_null_buf(&json_col); + let children = json_col.children.clone().unwrap(); + let child_array = array_from_json( + child_field, + children.get(0).unwrap().clone(), + dictionaries, + )?; + let offsets: Vec = json_col + .offset + .unwrap() + .iter() + .map(|v| v.as_i64().unwrap() as i32) + .collect(); + let array_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .add_buffer(Buffer::from(&offsets.to_byte_slice())) + .add_child_data(child_array.into_data()) + .null_bit_buffer(Some(null_buf)) + .build() + .unwrap(); + + let array = MapArray::from(array_data); + Ok(Arc::new(array)) + } + DataType::Union(fields, field_type_ids, _) => { + let type_ids = if let Some(type_id) = json_col.type_id { + type_id + } else { + return Err(ArrowError::JsonError( + "Cannot find expected type_id in json column".to_string(), + )); + }; + + let offset: Option = json_col.offset.map(|offsets| { + let offsets: Vec = + offsets.iter().map(|v| v.as_i64().unwrap() as i32).collect(); + Buffer::from(&offsets.to_byte_slice()) + }); + + let mut children: Vec<(Field, Arc)> = vec![]; + for (field, col) in fields.iter().zip(json_col.children.unwrap()) { + let array = array_from_json(field, col, dictionaries)?; + children.push((field.clone(), array)); + } + + let array = UnionArray::try_new( + field_type_ids, + Buffer::from(&type_ids.to_byte_slice()), + offset, + children, + ) + .unwrap(); + Ok(Arc::new(array)) + } + t => Err(ArrowError::JsonError(format!( + "data type {:?} not supported", + t + ))), + } +} + +pub fn dictionary_array_from_json( + field: &Field, + json_col: ArrowJsonColumn, + dict_key: &DataType, + 
dict_value: &DataType, + dictionary: &ArrowJsonDictionaryBatch, + dictionaries: Option<&HashMap>, +) -> Result { + match dict_key { + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 => { + let null_buf = create_null_buf(&json_col); + + // build the key data into a buffer, then construct values separately + let key_field = Field::new_dict( + "key", + dict_key.clone(), + field.is_nullable(), + field + .dict_id() + .expect("Dictionary fields must have a dict_id value"), + field + .dict_is_ordered() + .expect("Dictionary fields must have a dict_is_ordered value"), + ); + let keys = array_from_json(&key_field, json_col, None)?; + // note: not enough info on nullability of dictionary + let value_field = Field::new("value", dict_value.clone(), true); + let values = array_from_json( + &value_field, + dictionary.data.columns[0].clone(), + dictionaries, + )?; + + // convert key and value to dictionary data + let dict_data = ArrayData::builder(field.data_type().clone()) + .len(keys.len()) + .add_buffer(keys.data().buffers()[0].clone()) + .null_bit_buffer(Some(null_buf)) + .add_child_data(values.into_data()) + .build() + .unwrap(); + + let array = match dict_key { + DataType::Int8 => { + Arc::new(Int8DictionaryArray::from(dict_data)) as ArrayRef + } + DataType::Int16 => Arc::new(Int16DictionaryArray::from(dict_data)), + DataType::Int32 => Arc::new(Int32DictionaryArray::from(dict_data)), + DataType::Int64 => Arc::new(Int64DictionaryArray::from(dict_data)), + DataType::UInt8 => Arc::new(UInt8DictionaryArray::from(dict_data)), + DataType::UInt16 => Arc::new(UInt16DictionaryArray::from(dict_data)), + DataType::UInt32 => Arc::new(UInt32DictionaryArray::from(dict_data)), + DataType::UInt64 => Arc::new(UInt64DictionaryArray::from(dict_data)), + _ => unreachable!(), + }; + Ok(array) + } + _ => Err(ArrowError::JsonError(format!( + "Dictionary key type {:?} not supported", + dict_key + ))), } +} +/// A helper to create a null buffer from a Vec +fn create_null_buf(json_col: &ArrowJsonColumn) -> Buffer { + let num_bytes = bit_util::ceil(json_col.count, 8); + let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); + json_col + .validity + .clone() + .unwrap() + .iter() + .enumerate() + .for_each(|(i, v)| { + let null_slice = null_buf.as_slice_mut(); + if *v != 0 { + bit_util::set_bit(null_slice, i); + } + }); + null_buf.into() +} + +impl ArrowJsonBatch { pub fn from_batch(batch: &RecordBatch) -> ArrowJsonBatch { let mut json_batch = ArrowJsonBatch { count: batch.num_rows(), @@ -496,217 +1037,6 @@ impl ArrowJsonBatch { } } -/// Convert an Arrow JSON column/array into a vector of `Value` -fn json_from_col(col: &ArrowJsonColumn, data_type: &DataType) -> Vec { - match data_type { - DataType::List(field) => json_from_list_col(col, field.data_type()), - DataType::FixedSizeList(field, list_size) => { - json_from_fixed_size_list_col(col, field.data_type(), *list_size as usize) - } - DataType::Struct(fields) => json_from_struct_col(col, fields), - DataType::Map(field, keys_sorted) => json_from_map_col(col, field, *keys_sorted), - DataType::Int64 - | DataType::UInt64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - // convert int64 data from strings to numbers - let converted_col: Vec = col - .data - .clone() - .unwrap() - .iter() - .map(|v| { - Value::Number(match v { - Value::Number(number) => number.clone(), - Value::String(string) 
=> VNumber::from( - string - .parse::() - .expect("Unable to parse string as i64"), - ), - t => panic!("Cannot convert {} to number", t), - }) - }) - .collect(); - merge_json_array( - col.validity.as_ref().unwrap().as_slice(), - converted_col.as_slice(), - ) - } - DataType::Null => vec![], - _ => merge_json_array( - col.validity.as_ref().unwrap().as_slice(), - &col.data.clone().unwrap(), - ), - } -} - -/// Merge VALIDITY and DATA vectors from a primitive data type into a `Value` vector with nulls -fn merge_json_array(validity: &[u8], data: &[Value]) -> Vec { - validity - .iter() - .zip(data) - .map(|(v, d)| match v { - 0 => Value::Null, - 1 => d.clone(), - _ => panic!("Validity data should be 0 or 1"), - }) - .collect() -} - -/// Convert an Arrow JSON column/array of a `DataType::Struct` into a vector of `Value` -fn json_from_struct_col(col: &ArrowJsonColumn, fields: &[Field]) -> Vec { - let mut values = Vec::with_capacity(col.count); - - let children: Vec> = col - .children - .clone() - .unwrap() - .iter() - .zip(fields) - .map(|(child, field)| json_from_col(child, field.data_type())) - .collect(); - - // create a struct from children - for j in 0..col.count { - let mut map = serde_json::map::Map::new(); - for i in 0..children.len() { - map.insert(fields[i].name().to_string(), children[i][j].clone()); - } - values.push(Value::Object(map)); - } - - values -} - -/// Convert an Arrow JSON column/array of a `DataType::List` into a vector of `Value` -fn json_from_list_col(col: &ArrowJsonColumn, data_type: &DataType) -> Vec { - let mut values = Vec::with_capacity(col.count); - - // get the inner array - let child = &col.children.clone().expect("list type must have children")[0]; - let offsets: Vec = col - .offset - .clone() - .unwrap() - .iter() - .map(|o| match o { - Value::String(s) => s.parse::().unwrap(), - Value::Number(n) => n.as_u64().unwrap() as usize, - _ => panic!( - "Offsets should be numbers or strings that are convertible to numbers" - ), - }) - .collect(); - let inner = match data_type { - DataType::List(ref field) => json_from_col(child, field.data_type()), - DataType::Struct(fields) => json_from_struct_col(col, fields), - _ => merge_json_array( - child.validity.as_ref().unwrap().as_slice(), - &child.data.clone().unwrap(), - ), - }; - - for i in 0..col.count { - match &col.validity { - Some(validity) => match &validity[i] { - 0 => values.push(Value::Null), - 1 => { - values.push(Value::Array(inner[offsets[i]..offsets[i + 1]].to_vec())) - } - _ => panic!("Validity data should be 0 or 1"), - }, - None => { - // Null type does not have a validity vector - } - } - } - - values -} - -/// Convert an Arrow JSON column/array of a `DataType::List` into a vector of `Value` -fn json_from_fixed_size_list_col( - col: &ArrowJsonColumn, - data_type: &DataType, - list_size: usize, -) -> Vec { - let mut values = Vec::with_capacity(col.count); - - // get the inner array - let child = &col.children.clone().expect("list type must have children")[0]; - let inner = match data_type { - DataType::List(ref field) => json_from_col(child, field.data_type()), - DataType::FixedSizeList(ref field, _) => json_from_col(child, field.data_type()), - DataType::Struct(fields) => json_from_struct_col(col, fields), - _ => merge_json_array( - child.validity.as_ref().unwrap().as_slice(), - &child.data.clone().unwrap(), - ), - }; - - for i in 0..col.count { - match &col.validity { - Some(validity) => match &validity[i] { - 0 => values.push(Value::Null), - 1 => values.push(Value::Array( - inner[(list_size * 
i)..(list_size * (i + 1))].to_vec(), - )), - _ => panic!("Validity data should be 0 or 1"), - }, - None => {} - } - } - - values -} - -fn json_from_map_col( - col: &ArrowJsonColumn, - field: &Field, - _keys_sorted: bool, -) -> Vec { - let mut values = Vec::with_capacity(col.count); - - // get the inner array - let child = &col.children.clone().expect("list type must have children")[0]; - let offsets: Vec = col - .offset - .clone() - .unwrap() - .iter() - .map(|o| match o { - Value::String(s) => s.parse::().unwrap(), - Value::Number(n) => n.as_u64().unwrap() as usize, - _ => panic!( - "Offsets should be numbers or strings that are convertible to numbers" - ), - }) - .collect(); - - let inner = match field.data_type() { - DataType::Struct(fields) => json_from_struct_col(child, fields), - _ => panic!("Map child must be Struct"), - }; - - for i in 0..col.count { - match &col.validity { - Some(validity) => match &validity[i] { - 0 => values.push(Value::Null), - 1 => { - values.push(Value::Array(inner[offsets[i]..offsets[i + 1]].to_vec())) - } - _ => panic!("Validity data should be 0 or 1"), - }, - None => { - // Null type does not have a validity vector - } - } - } - - values -} #[cfg(test)] mod tests { use super::*; @@ -945,22 +1275,25 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data.into_data()) + .null_bit_buffer(Some(Buffer::from([0b00000011]))) .build() .unwrap(); let lists = ListArray::from(list_data); let structs_int32s = Int32Array::from(vec![None, Some(-2), None]); let structs_utf8s = StringArray::from(vec![None, None, Some("aaaaaa")]); - let structs = StructArray::from(vec![ - ( - Field::new("int32s", DataType::Int32, true), - Arc::new(structs_int32s) as ArrayRef, - ), - ( - Field::new("utf8s", DataType::Utf8, true), - Arc::new(structs_utf8s) as ArrayRef, - ), + let struct_data_type = DataType::Struct(vec![ + Field::new("int32s", DataType::Int32, true), + Field::new("utf8s", DataType::Utf8, true), ]); + let struct_data = ArrayData::builder(struct_data_type) + .len(3) + .add_child_data(structs_int32s.data().clone()) + .add_child_data(structs_utf8s.data().clone()) + .null_bit_buffer(Some(Buffer::from([0b00000011]))) + .build() + .unwrap(); + let structs = StructArray::from(struct_data); let record_batch = RecordBatch::try_new( Arc::new(schema.clone()), @@ -1005,6 +1338,6 @@ mod tests { // test schemas assert!(arrow_json.schema.equals_schema(&schema)); // test record batch - assert!(arrow_json.batches[0].equals_batch(&record_batch)); + assert_eq!(arrow_json.get_record_batches().unwrap()[0], record_batch); } } diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index 84d445e9a1f8..6f4d9e34a99b 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -107,7 +107,7 @@ fn create_column(field: &str, columns: &[ArrayRef]) -> Result { mod tests { use crate::{ array::{ - self, new_null_array, Array, BasicDecimalArray, Date32Array, Date64Array, + self, new_null_array, Array, Date32Array, Date64Array, FixedSizeBinaryBuilder, Float16Array, Int32Array, PrimitiveBuilder, StringArray, StringBuilder, StringDictionaryBuilder, StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, diff --git a/dev/release/README.md b/dev/release/README.md index 592d4c39fab9..4ffa85d2abaa 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -21,10 +21,23 @@ ## Overview -We try to release a new version of Arrow every two weeks. 
This cadence balances getting new features into arrow without overwhelming downstream projects with too frequent changes. +This file documents the release process for: + +1. The "Rust Arrow Crates": `arrow`, `arrow-flight`, `parquet`, and `parquet-derive`. +2. The `object_store` crate. + +### The Rust Arrow Crates + +The Rust Arrow Crates are interconnected (e.g. `parquet` has an optional dependency on `arrow`) so we increment and release all of them together. We try to release a new version of the "Rust Arrow Crates" every two weeks. This cadence balances getting new features into the community without overwhelming downstream projects with too frequent changes or overly burdening maintainers. If any code has been merged to master that has a breaking API change, as defined in [Rust RFC 1105](https://github.com/rust-lang/rfcs/blob/master/text/1105-api-evolution.md), the major version number is incremented (e.g. `9.0.2` to `10.0.0`). Otherwise the minor version is incremented (e.g. `9.0.2` to `9.1.0`). +### `object_store` crate + +At the time of writing, we release a new version of `object_store` on demand rather than on a regular schedule. + +As we are still in an early phase, we use the 0.x version scheme. If any code has been merged to master that has a breaking API change, as defined in [Rust RFC 1105](https://github.com/rust-lang/rfcs/blob/master/text/1105-api-evolution.md), the minor version number is incremented (e.g. `0.3.0` to `0.4.0`). Otherwise the patch version is incremented (e.g. `0.3.0` to `0.3.1`). + # Release Mechanics ## Process Overview @@ -47,13 +60,17 @@ labels associated with them. Now prepare a PR to update `CHANGELOG.md` and versions on `master` to reflect the planned release. -See [#1141](https://github.com/apache/arrow-rs/pull/1141) for an example. +For the Rust Arrow crates, do this in the root of this repository. For example, see [#2323](https://github.com/apache/arrow-rs/pull/2323). + +For `object_store`, the same process is done in the `object_store` directory. Examples TBD ```bash git checkout master git pull git checkout -b make-release +# Move the content of CHANGELOG.md to CHANGELOG-old.md + # manually edit ./dev/release/update_change_log.sh to reflect the release version # create the changelog CHANGELOG_GITHUB_TOKEN=<TOKEN> ./dev/release/update_change_log.sh @@ -61,7 +78,7 @@ CHANGELOG_GITHUB_TOKEN=<TOKEN> ./dev/release/update_change_log.sh git commit -a -m 'Create changelog' # update versions -sed -i '' -e 's/14.0.0/19.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/20.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' ``` @@ -82,7 +99,11 @@ distribution servers. While the official release artifact is a signed tarball, we also tag the commit it was created for convenience and code archaeology. -Using a string such as `4.0.1` as the `<version>`, create and push the tag thusly: +For a Rust Arrow Crates release, use a string such as `4.0.1` as the `<version>`. + +For `object_store` releases, use a string such as `object_store_0.4.0` as the `<version>`. + +Create and push the tag thusly: ```shell git fetch apache @@ -97,12 +118,20 @@ Pick numbers in sequential order, with `1` for `rc1`, `2` for `rc2`, etc. ### Create, sign, and upload tarball -Run `create-tarball.sh` with the `<version>` tag and `<rc>` you found in previous steps: +Run `create-tarball.sh` with the `<version>` tag and `<rc>` you found in previous steps.
+ +Rust Arrow Crates: + ```shell ./dev/release/create-tarball.sh 4.1.0 2 ``` +`object_store`: + +```shell +./object_store/dev/release/create-tarball.sh 4.1.0 2 +``` + The `create-tarball.sh` script 1. creates and uploads a release candidate tarball to the [arrow @@ -114,7 +143,7 @@ The `create-tarball.sh` script ### Vote on Release Candidate tarball -Send the email output from the script to dev@arrow.apache.org. The email should look like +Send an email, based on the output from the script, to dev@arrow.apache.org. The email should look like ``` To: dev@arrow.apache.org @@ -144,11 +173,11 @@ The vote will be open for at least 72 hours. [3]: https://github.com/apache/arrow-rs/blob/a5dd428f57e62db20a945e8b1895de91405958c4/CHANGELOG.md ``` -For the release to become "official" it needs at least three PMC members to vote +1 on it. +For the release to become "official" it needs at least three Apache Arrow PMC members to vote +1 on it. ## Verifying release candidates -The `dev/release/verify-release-candidate.sh` is a script in this repository that can assist in the verification process. Run it like: +The `dev/release/verify-release-candidate.sh` or `object_store/dev/release/verify-release-candidate.sh` are scripts in this repository that can assist in the verification process. Run them like: ``` ./dev/release/verify-release-candidate.sh 4.1.0 2 @@ -162,10 +191,18 @@ If the release is not approved, fix whatever the problem is and try again with t Move tarball to the release location in SVN, e.g. https://dist.apache.org/repos/dist/release/arrow/arrow-4.1.0/, using the `release-tarball.sh` script: +Rust Arrow Crates: + ```shell ./dev/release/release-tarball.sh 4.1.0 2 ``` +`object_store`: + +```shell +./object_store/dev/release/release-tarball.sh 4.1.0 2 +``` + Congratulations! The release is now official! ### Publish on Crates.io @@ -188,9 +225,17 @@ Verify that the Cargo.toml in the tarball contains the correct version (e.g.
`version = "0.11.0"`) and then publish the crate with the following commands +Rust Arrow Crates: + ```shell (cd arrow && cargo publish) (cd arrow-flight && cargo publish) (cd parquet && cargo publish) (cd parquet_derive && cargo publish) ``` + +`object_store` + +```shell +cargo publish +``` diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index dc3d9e4e4a2d..b2ca561e073d 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="18.0.0" -FUTURE_RELEASE="19.0.0" +SINCE_TAG="19.0.0" +FUTURE_RELEASE="20.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" @@ -40,6 +40,8 @@ OUTPUT_PATH="${SOURCE_TOP_DIR}/CHANGELOG.md" # remove license header so github-changelog-generator has a clean base to append sed -i.bak '1,18d' "${OUTPUT_PATH}" +# use exclude-tags-regex to filter out tags used for object_store +# crates and only only look at tags that DO NOT begin with `object_store_` pushd "${SOURCE_TOP_DIR}" docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pwd)":/usr/local/src/your-app githubchangeloggenerator/github-changelog-generator \ --user apache \ @@ -48,6 +50,7 @@ docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pw --cache-log=.githubchangeloggenerator.cache.log \ --http-cache \ --max-issues=300 \ + --exclude-tags-regex "^object_store_\d+\.\d+\.\d+$" \ --since-tag ${SINCE_TAG} \ --future-release ${FUTURE_RELEASE} diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index a5ed04c6f8b8..b60465b9732c 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -72,24 +72,6 @@ fetch_archive() { ${sha512_verify} ${dist_name}.tar.gz.sha512 } -verify_dir_artifact_signatures() { - # verify the signature and the checksums of each artifact - find $1 -name '*.asc' | while read sigfile; do - artifact=${sigfile/.asc/} - gpg --verify $sigfile $artifact || exit 1 - - # go into the directory because the checksum files contain only the - # basename of the artifact - pushd $(dirname $artifact) - base_artifact=$(basename $artifact) - if [ -f $base_artifact.sha256 ]; then - ${sha256_verify} $base_artifact.sha256 || exit 1 - fi - ${sha512_verify} $base_artifact.sha512 || exit 1 - popd - done -} - setup_tempdir() { cleanup() { if [ "${TEST_SUCCESS}" = "yes" ]; then diff --git a/integration-testing/Cargo.toml b/integration-testing/Cargo.toml index 76a726fe2de5..12892badcd27 100644 --- a/integration-testing/Cargo.toml +++ b/integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests" -version = "19.0.0" +version = "20.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/integration-testing/src/bin/arrow-json-integration-test.rs b/integration-testing/src/bin/arrow-json-integration-test.rs index 69b73b19f222..b442e8b5ed30 100644 --- a/integration-testing/src/bin/arrow-json-integration-test.rs +++ b/integration-testing/src/bin/arrow-json-integration-test.rs @@ -91,7 +91,10 @@ fn arrow_to_json(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> for f in reader.schema().fields() { fields.push(ArrowJsonField::from(f)); } - let schema = ArrowJsonSchema { fields }; + let schema = ArrowJsonSchema { + fields, + metadata: None, + }; let 
batches = reader .map(|batch| Ok(ArrowJsonBatch::from_batch(&batch?))) diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 7be70bfa2474..5d3da15d3f50 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -17,29 +17,17 @@ //! Common code used in the integration test binaries -use hex::decode; use serde_json::Value; use arrow::util::integration_util::ArrowJsonBatch; -use arrow::array::*; -use arrow::datatypes::{DataType, Field, IntervalUnit, Schema}; -use arrow::error::{ArrowError, Result}; +use arrow::datatypes::Schema; +use arrow::error::Result; use arrow::record_batch::RecordBatch; -use arrow::{ - buffer::Buffer, - buffer::MutableBuffer, - datatypes::ToByteSlice, - util::{bit_util, integration_util::*}, -}; - -use arrow::util::decimal::{BasicDecimal, Decimal256}; -use num::bigint::BigInt; -use num::Signed; +use arrow::util::integration_util::*; use std::collections::HashMap; use std::fs::File; use std::io::BufReader; -use std::sync::Arc; /// The expected username for the basic auth integration test. pub const AUTH_USERNAME: &str = "arrow"; @@ -88,717 +76,3 @@ pub fn read_json_file(json_name: &str) -> Result { batches, }) } - -fn record_batch_from_json( - schema: &Schema, - json_batch: ArrowJsonBatch, - json_dictionaries: Option<&HashMap>, -) -> Result { - let mut columns = vec![]; - - for (field, json_col) in schema.fields().iter().zip(json_batch.columns) { - let col = array_from_json(field, json_col, json_dictionaries)?; - columns.push(col); - } - - RecordBatch::try_new(Arc::new(schema.clone()), columns) -} - -/// Construct an Arrow array from a partially typed JSON column -fn array_from_json( - field: &Field, - json_col: ArrowJsonColumn, - dictionaries: Option<&HashMap>, -) -> Result { - match field.data_type() { - DataType::Null => Ok(Arc::new(NullArray::new(json_col.count))), - DataType::Boolean => { - let mut b = BooleanBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_bool().unwrap()), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Int8 => { - let mut b = Int8Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_i64().ok_or_else(|| { - ArrowError::JsonError(format!( - "Unable to get {:?} as int64", - value - )) - })? 
as i8), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Int16 => { - let mut b = Int16Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_i64().unwrap() as i16), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - let mut b = Int32Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_i64().unwrap() as i32), - _ => b.append_null(), - }; - } - let array = Arc::new(b.finish()) as ArrayRef; - arrow::compute::cast(&array, field.data_type()) - } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) - | DataType::Interval(IntervalUnit::DayTime) => { - let mut b = Int64Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(match value { - Value::Number(n) => n.as_i64().unwrap(), - Value::String(s) => { - s.parse().expect("Unable to parse string as i64") - } - Value::Object(ref map) - if map.contains_key("days") - && map.contains_key("milliseconds") => - { - match field.data_type() { - DataType::Interval(IntervalUnit::DayTime) => { - let days = map.get("days").unwrap(); - let milliseconds = map.get("milliseconds").unwrap(); - - match (days, milliseconds) { - (Value::Number(d), Value::Number(m)) => { - let mut bytes = [0_u8; 8]; - let m = (m.as_i64().unwrap() as i32) - .to_le_bytes(); - let d = (d.as_i64().unwrap() as i32) - .to_le_bytes(); - - let c = [d, m].concat(); - bytes.copy_from_slice(c.as_slice()); - i64::from_le_bytes(bytes) - } - _ => panic!( - "Unable to parse {:?} as interval daytime", - value - ), - } - } - _ => panic!( - "Unable to parse {:?} as interval daytime", - value - ), - } - } - _ => panic!("Unable to parse {:?} as number", value), - }), - _ => b.append_null(), - }; - } - let array = Arc::new(b.finish()) as ArrayRef; - arrow::compute::cast(&array, field.data_type()) - } - DataType::UInt8 => { - let mut b = UInt8Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_u64().unwrap() as u8), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::UInt16 => { - let mut b = UInt16Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_u64().unwrap() as u16), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::UInt32 => { - let mut b = UInt32Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_u64().unwrap() as u32), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::UInt64 => { - let mut b = UInt64Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 
=> b.append_value( - value - .as_str() - .unwrap() - .parse() - .expect("Unable to parse string as u64"), - ), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let mut b = IntervalMonthDayNanoBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(match value { - Value::Object(v) => { - let months = v.get("months").unwrap(); - let days = v.get("days").unwrap(); - let nanoseconds = v.get("nanoseconds").unwrap(); - match (months, days, nanoseconds) { - ( - Value::Number(months), - Value::Number(days), - Value::Number(nanoseconds), - ) => { - let months = months.as_i64().unwrap() as i32; - let days = days.as_i64().unwrap() as i32; - let nanoseconds = nanoseconds.as_i64().unwrap(); - let months_days_ns: i128 = ((nanoseconds as i128) - & 0xFFFFFFFFFFFFFFFF) - << 64 - | ((days as i128) & 0xFFFFFFFF) << 32 - | ((months as i128) & 0xFFFFFFFF); - months_days_ns - } - (_, _, _) => { - panic!("Unable to parse {:?} as MonthDayNano", v) - } - } - } - _ => panic!("Unable to parse {:?} as MonthDayNano", value), - }), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Float32 => { - let mut b = Float32Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_f64().unwrap() as f32), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Float64 => { - let mut b = Float64Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_f64().unwrap()), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Binary => { - let mut b = BinaryBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => { - let v = decode(value.as_str().unwrap()).unwrap(); - b.append_value(&v) - } - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::LargeBinary => { - let mut b = LargeBinaryBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => { - let v = decode(value.as_str().unwrap()).unwrap(); - b.append_value(&v) - } - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Utf8 => { - let mut b = StringBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_str().unwrap()), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::LargeUtf8 => { - let mut b = LargeStringBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_str().unwrap()), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::FixedSizeBinary(len) => { - let mut b = FixedSizeBinaryBuilder::new(json_col.count, *len); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match 
is_valid { - 1 => { - let v = hex::decode(value.as_str().unwrap()).unwrap(); - b.append_value(&v)? - } - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::List(child_field) => { - let null_buf = create_null_buf(&json_col); - let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; - let offsets: Vec = json_col - .offset - .unwrap() - .iter() - .map(|v| v.as_i64().unwrap() as i32) - .collect(); - let list_data = ArrayData::builder(field.data_type().clone()) - .len(json_col.count) - .offset(0) - .add_buffer(Buffer::from(&offsets.to_byte_slice())) - .add_child_data(child_array.into_data()) - .null_bit_buffer(Some(null_buf)) - .build() - .unwrap(); - Ok(Arc::new(ListArray::from(list_data))) - } - DataType::LargeList(child_field) => { - let null_buf = create_null_buf(&json_col); - let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; - let offsets: Vec = json_col - .offset - .unwrap() - .iter() - .map(|v| match v { - Value::Number(n) => n.as_i64().unwrap(), - Value::String(s) => s.parse::().unwrap(), - _ => panic!("64-bit offset must be either string or number"), - }) - .collect(); - let list_data = ArrayData::builder(field.data_type().clone()) - .len(json_col.count) - .offset(0) - .add_buffer(Buffer::from(&offsets.to_byte_slice())) - .add_child_data(child_array.into_data()) - .null_bit_buffer(Some(null_buf)) - .build() - .unwrap(); - Ok(Arc::new(LargeListArray::from(list_data))) - } - DataType::FixedSizeList(child_field, _) => { - let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; - let null_buf = create_null_buf(&json_col); - let list_data = ArrayData::builder(field.data_type().clone()) - .len(json_col.count) - .add_child_data(child_array.into_data()) - .null_bit_buffer(Some(null_buf)) - .build() - .unwrap(); - Ok(Arc::new(FixedSizeListArray::from(list_data))) - } - DataType::Struct(fields) => { - // construct struct with null data - let null_buf = create_null_buf(&json_col); - let mut array_data = ArrayData::builder(field.data_type().clone()) - .len(json_col.count) - .null_bit_buffer(Some(null_buf)); - - for (field, col) in fields.iter().zip(json_col.children.unwrap()) { - let array = array_from_json(field, col, dictionaries)?; - array_data = array_data.add_child_data(array.into_data()); - } - - let array = StructArray::from(array_data.build().unwrap()); - Ok(Arc::new(array)) - } - DataType::Dictionary(key_type, value_type) => { - let dict_id = field.dict_id().ok_or_else(|| { - ArrowError::JsonError(format!( - "Unable to find dict_id for field {:?}", - field - )) - })?; - // find dictionary - let dictionary = dictionaries - .ok_or_else(|| { - ArrowError::JsonError(format!( - "Unable to find any dictionaries for field {:?}", - field - )) - })? 
- .get(&dict_id); - match dictionary { - Some(dictionary) => dictionary_array_from_json( - field, - json_col, - key_type, - value_type, - dictionary, - dictionaries, - ), - None => Err(ArrowError::JsonError(format!( - "Unable to find dictionary for field {:?}", - field - ))), - } - } - DataType::Decimal128(precision, scale) => { - let mut b = Decimal128Builder::new(json_col.count, *precision, *scale); - // C++ interop tests involve incompatible decimal values - unsafe { - b.disable_value_validation(); - } - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => { - b.append_value(value.as_str().unwrap().parse::().unwrap())? - } - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Decimal256(precision, scale) => { - let mut b = Decimal256Builder::new(json_col.count, *precision, *scale); - // C++ interop tests involve incompatible decimal values - unsafe { - b.disable_value_validation(); - } - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => { - let str = value.as_str().unwrap(); - let integer = BigInt::parse_bytes(str.as_bytes(), 10).unwrap(); - let integer_bytes = integer.to_signed_bytes_le(); - let mut bytes = if integer.is_positive() { - [0_u8; 32] - } else { - [255_u8; 32] - }; - bytes[0..integer_bytes.len()] - .copy_from_slice(integer_bytes.as_slice()); - let decimal = - Decimal256::try_new_from_bytes(*precision, *scale, &bytes) - .unwrap(); - b.append_value(&decimal)?; - } - _ => b.append_null(), - } - } - Ok(Arc::new(b.finish())) - } - DataType::Map(child_field, _) => { - let null_buf = create_null_buf(&json_col); - let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; - let offsets: Vec = json_col - .offset - .unwrap() - .iter() - .map(|v| v.as_i64().unwrap() as i32) - .collect(); - let array_data = ArrayData::builder(field.data_type().clone()) - .len(json_col.count) - .add_buffer(Buffer::from(&offsets.to_byte_slice())) - .add_child_data(child_array.into_data()) - .null_bit_buffer(Some(null_buf)) - .build() - .unwrap(); - - let array = MapArray::from(array_data); - Ok(Arc::new(array)) - } - DataType::Union(fields, field_type_ids, _) => { - let type_ids = if let Some(type_id) = json_col.type_id { - type_id - } else { - return Err(ArrowError::JsonError( - "Cannot find expected type_id in json column".to_string(), - )); - }; - - let offset: Option = json_col.offset.map(|offsets| { - let offsets: Vec = - offsets.iter().map(|v| v.as_i64().unwrap() as i32).collect(); - Buffer::from(&offsets.to_byte_slice()) - }); - - let mut children: Vec<(Field, Arc)> = vec![]; - for (field, col) in fields.iter().zip(json_col.children.unwrap()) { - let array = array_from_json(field, col, dictionaries)?; - children.push((field.clone(), array)); - } - - let array = UnionArray::try_new( - field_type_ids, - Buffer::from(&type_ids.to_byte_slice()), - offset, - children, - ) - .unwrap(); - Ok(Arc::new(array)) - } - t => Err(ArrowError::JsonError(format!( - "data type {:?} not supported", - t - ))), - } -} - -fn dictionary_array_from_json( - field: &Field, - json_col: ArrowJsonColumn, - dict_key: &DataType, - dict_value: &DataType, - dictionary: &ArrowJsonDictionaryBatch, - dictionaries: Option<&HashMap>, -) -> Result { - match dict_key { - DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | 
DataType::Int64 - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 => { - let null_buf = create_null_buf(&json_col); - - // build the key data into a buffer, then construct values separately - let key_field = Field::new_dict( - "key", - dict_key.clone(), - field.is_nullable(), - field - .dict_id() - .expect("Dictionary fields must have a dict_id value"), - field - .dict_is_ordered() - .expect("Dictionary fields must have a dict_is_ordered value"), - ); - let keys = array_from_json(&key_field, json_col, None)?; - // note: not enough info on nullability of dictionary - let value_field = Field::new("value", dict_value.clone(), true); - let values = array_from_json( - &value_field, - dictionary.data.columns[0].clone(), - dictionaries, - )?; - - // convert key and value to dictionary data - let dict_data = ArrayData::builder(field.data_type().clone()) - .len(keys.len()) - .add_buffer(keys.data().buffers()[0].clone()) - .null_bit_buffer(Some(null_buf)) - .add_child_data(values.into_data()) - .build() - .unwrap(); - - let array = match dict_key { - DataType::Int8 => { - Arc::new(Int8DictionaryArray::from(dict_data)) as ArrayRef - } - DataType::Int16 => Arc::new(Int16DictionaryArray::from(dict_data)), - DataType::Int32 => Arc::new(Int32DictionaryArray::from(dict_data)), - DataType::Int64 => Arc::new(Int64DictionaryArray::from(dict_data)), - DataType::UInt8 => Arc::new(UInt8DictionaryArray::from(dict_data)), - DataType::UInt16 => Arc::new(UInt16DictionaryArray::from(dict_data)), - DataType::UInt32 => Arc::new(UInt32DictionaryArray::from(dict_data)), - DataType::UInt64 => Arc::new(UInt64DictionaryArray::from(dict_data)), - _ => unreachable!(), - }; - Ok(array) - } - _ => Err(ArrowError::JsonError(format!( - "Dictionary key type {:?} not supported", - dict_key - ))), - } -} - -/// A helper to create a null buffer from a Vec -fn create_null_buf(json_col: &ArrowJsonColumn) -> Buffer { - let num_bytes = bit_util::ceil(json_col.count, 8); - let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); - json_col - .validity - .clone() - .unwrap() - .iter() - .enumerate() - .for_each(|(i, v)| { - let null_slice = null_buf.as_slice_mut(); - if *v != 0 { - bit_util::set_bit(null_slice, i); - } - }); - null_buf.into() -} diff --git a/object_store/.circleci/config.yml b/object_store/.circleci/config.yml deleted file mode 100644 index b4dff6d53acc..000000000000 --- a/object_store/.circleci/config.yml +++ /dev/null @@ -1,262 +0,0 @@ ---- -# CI Overview -# ----------- -# -# Each night: -# -# A build image is created (ci_image) from `docker/Dockerfile.ci` and is -# pushed to `quay.io/influxdb/rust:ci`. This build image is then used to run -# the CI tasks for the day. -# -# Every commit: -# -# The CI for every PR and merge to main runs tests, fmt, lints and compiles debug binaries -# -# On main if all these checks pass it will then additionally compile in "release" mode and -# publish a docker image to quay.io/influxdb/iox:$COMMIT_SHA -# -# Manual CI Image: -# -# It is possible to manually trigger a rebuild of the image used in CI. To do this, navigate to -# https://app.circleci.com/pipelines/github/influxdata/influxdb_iox?branch=main (overriding the -# branch name if desired). 
Then: -# - Click "Run Pipeline" in the top-right -# - Expand "Add Parameters" -# - Add a "boolean" parameter called "ci_image" with the value true -# - Click "Run Pipeline" -# -# If you refresh the page you should see a newly running ci_image workflow -# - -version: 2.1 - -orbs: - win: circleci/windows@4.1 - -commands: - rust_components: - description: Verify installed components - steps: - - run: - name: Verify installed components - command: | - rustup --version - rustup show - cargo fmt --version - cargo clippy --version - - cache_restore: - description: Restore Cargo Cache - steps: - - restore_cache: - name: Restoring Cargo Cache - keys: - - cargo-cache-{{ arch }}-{{ .Branch }}-{{ checksum "Cargo.lock" }} - - cargo-cache-{{ arch }}-{{ .Branch }} - - cargo-cache - cache_save: - description: Save Cargo Cache - steps: - - save_cache: - name: Save Cargo Cache - paths: - - /usr/local/cargo/registry - key: cargo-cache-{{ arch }}-{{ .Branch }}-{{ checksum "Cargo.lock" }} - -jobs: - fmt: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Rust fmt - command: cargo fmt --all -- --check - - cache_save - lint: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Clippy - command: cargo clippy --all-targets --all-features --workspace -- -D warnings - - cache_save - cargo_audit: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Install cargo-deny - command: cargo install --force cargo-deny - - run: - name: cargo-deny Checks - command: cargo deny check -s - - cache_save - check: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Install cargo-hack - command: cargo install cargo-hack - - run: - name: Check all features - command: cargo hack check --feature-powerset --no-dev-deps --workspace - - cache_save - doc: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Cargo doc - # excluding datafusion because it's effectively a dependency masqueraded as workspace crate. - command: cargo doc --document-private-items --no-deps --workspace --exclude datafusion - - cache_save - - run: - name: Compress Docs - command: tar -cvzf rustdoc.tar.gz target/doc/ - - store_artifacts: - path: rustdoc.tar.gz - test: - # setup multiple docker images (see https://circleci.com/docs/2.0/configuration-reference/#docker) - docker: - - image: quay.io/influxdb/rust:ci - - image: localstack/localstack:0.14.4 - - image: mcr.microsoft.com/azure-storage/azurite - - image: fsouza/fake-gcs-server - command: - - "-scheme" - - "http" - resource_class: 2xlarge # use of a smaller executor tends crashes on link - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - RUST_BACKTRACE: "1" - # Run integration tests - TEST_INTEGRATION: 1 - AWS_DEFAULT_REGION: "us-east-1" - AWS_ACCESS_KEY_ID: test - AWS_SECRET_ACCESS_KEY: test - AWS_ENDPOINT: http://127.0.0.1:4566 - AZURE_USE_EMULATOR: "1" - GOOGLE_SERVICE_ACCOUNT: "/tmp/gcs.json" - OBJECT_STORE_BUCKET: test-bucket - steps: - - run: - name: Setup localstack (AWS emulation) - command: | - cd /tmp - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - unzip awscliv2.zip - sudo ./aws/install - aws --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket - - run: - name: Setup Azurite (Azure emulation) - # the magical connection string is from https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio#http-connection-strings - command: | - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' - - run: - name: Setup fake GCS server - command: | - curl -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" - echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > "$GOOGLE_SERVICE_ACCOUNT" - - checkout - - rust_components - - cache_restore - - run: - name: Cargo test - command: cargo test --workspace --features=aws,azure,azure_test,gcp - - cache_save - - test_windows: - executor: - name: win/default - size: medium - environment: - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - run: - name: Download rustup - command: wget https://win.rustup.rs/x86_64 -O rustup-init.exe - - run: - name: Install rustup - command: .\rustup-init.exe -y --default-host=x86_64-pc-windows-msvc - - run: - name: Cargo test - command: cargo test --workspace - -workflows: - version: 2 - - # CI for all pull requests. - ci: - jobs: - - check - - fmt - - lint - - cargo_audit - - test - - test_windows - - doc diff --git a/object_store/.github_changelog_generator b/object_store/.github_changelog_generator new file mode 100644 index 000000000000..cbd8aa0c4b48 --- /dev/null +++ b/object_store/.github_changelog_generator @@ -0,0 +1,27 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +# Add special sections for documentation, security and performance +add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["documentation"]},"security":{"prefix":"**Security updates:**","labels":["security"]},"performance":{"prefix":"**Performance improvements:**","labels":["performance"]}} +# so that the component is shown associated with the issue +issue-line-labels=object-store +# skip non object_store issues +exclude-labels=development-process,invalid,arrow,parquet,arrow-flight +breaking_labels=api-change diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md new file mode 100644 index 000000000000..93faa678ffa8 --- /dev/null +++ b/object_store/CHANGELOG.md @@ -0,0 +1,70 @@ + + +# Changelog + +## [object_store_0.4.0](https://github.com/apache/arrow-rs/tree/object_store_0.4.0) (2022-08-10) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.3.0...object_store_0.4.0) + +**Implemented enhancements:** + +- Relax Path Validation to Allow Any Percent-Encoded Sequence [\#2355](https://github.com/apache/arrow-rs/issues/2355) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support get\_multi\_ranges in ObjectStore [\#2293](https://github.com/apache/arrow-rs/issues/2293) +- object\_store: Create explicit test for symlinks [\#2206](https://github.com/apache/arrow-rs/issues/2206) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Make builder style configuration for object stores [\#2203](https://github.com/apache/arrow-rs/issues/2203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Add example in the main documentation readme [\#2202](https://github.com/apache/arrow-rs/issues/2202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- Azure/S3 Storage Fails to Copy Blob with URL-encoded Path [\#2353](https://github.com/apache/arrow-rs/issues/2353) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Accessing a file with a percent-encoded name on the filesystem with ObjectStore LocalFileSystem [\#2349](https://github.com/apache/arrow-rs/issues/2349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Documentation updates:** + +- Improve `object_store crate` documentation [\#2260](https://github.com/apache/arrow-rs/pull/2260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) + +**Merged pull requests:** + +- Canonicalize filesystem paths in user-facing APIs \(\#2370\) [\#2371](https://github.com/apache/arrow-rs/pull/2371) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix object\_store lint [\#2367](https://github.com/apache/arrow-rs/pull/2367) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Relax path validation \(\#2355\) [\#2356](https://github.com/apache/arrow-rs/pull/2356) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix Copy from percent-encoded path \(\#2353\) [\#2354](https://github.com/apache/arrow-rs/pull/2354) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore::get\_ranges \(\#2293\) [\#2336](https://github.com/apache/arrow-rs/pull/2336) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove vestigal ` object_store/.circleci/` [\#2337](https://github.com/apache/arrow-rs/pull/2337) ([alamb](https://github.com/alamb)) +- Handle symlinks in LocalFileSystem \(\#2206\) [\#2269](https://github.com/apache/arrow-rs/pull/2269) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Retry GCP requests on server error [\#2243](https://github.com/apache/arrow-rs/pull/2243) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add LimitStore \(\#2175\) [\#2242](https://github.com/apache/arrow-rs/pull/2242) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Only trigger `arrow` CI on changes to arrow [\#2227](https://github.com/apache/arrow-rs/pull/2227) ([alamb](https://github.com/alamb)) +- Update instructions on how to join the Slack channel [\#2219](https://github.com/apache/arrow-rs/pull/2219) ([HaoYang670](https://github.com/HaoYang670)) +- Add Builder style config objects for object\_store [\#2204](https://github.com/apache/arrow-rs/pull/2204) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Ignore broken symlinks for LocalFileSystem object store [\#2195](https://github.com/apache/arrow-rs/pull/2195) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jccampagne](https://github.com/jccampagne)) +- Change CI names to match crate names [\#2189](https://github.com/apache/arrow-rs/pull/2189) ([alamb](https://github.com/alamb)) +- Split most arrow specific CI checks into their own workflows \(reduce common CI time to 21 minutes\) [\#2168](https://github.com/apache/arrow-rs/pull/2168) ([alamb](https://github.com/alamb)) +- Remove another attempt to cache target directory in action.yaml [\#2167](https://github.com/apache/arrow-rs/pull/2167) ([alamb](https://github.com/alamb)) +- Run actions on push to master, pull requests [\#2166](https://github.com/apache/arrow-rs/pull/2166) ([alamb](https://github.com/alamb)) +- Break parquet\_derive and arrow\_flight tests into their own workflows [\#2165](https://github.com/apache/arrow-rs/pull/2165) ([alamb](https://github.com/alamb)) +- Only run integration tests when `arrow` changes [\#2152](https://github.com/apache/arrow-rs/pull/2152) ([alamb](https://github.com/alamb)) +- Break out docs CI job to its own github action [\#2151](https://github.com/apache/arrow-rs/pull/2151) ([alamb](https://github.com/alamb)) +- Do not pretend to cache rust build artifacts, speed up CI by ~20% [\#2150](https://github.com/apache/arrow-rs/pull/2150) ([alamb](https://github.com/alamb)) +- Port `object_store` integration tests, use github actions [\#2148](https://github.com/apache/arrow-rs/pull/2148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Port Add stream upload \(multi-part upload\) [\#2147](https://github.com/apache/arrow-rs/pull/2147) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Increase upper wait time to reduce flakyness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) + +\* *This Changelog was 
automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index aaf9ee947b54..ffb65aaa7ee7 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.3.0" +version = "0.4.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" @@ -46,9 +46,9 @@ serde = { version = "1.0", default-features = false, features = ["derive"], opti serde_json = { version = "1.0", default-features = false, optional = true } quick-xml = { version = "0.23.0", features = ["serialize"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } -ring = { version = "0.16", default-features = false, features = ["std"] } +ring = { version = "0.16", default-features = false, features = ["std"], optional = true } base64 = { version = "0.13", default-features = false, optional = true } -rand = { version = "0.8", default-features = false, optional = true, features = ["std", "std_rng"] } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } # for rusoto hyper = { version = "0.14", optional = true, default-features = false } # for rusoto @@ -63,7 +63,7 @@ rusoto_sts = { version = "0.48.0", optional = true, default-features = false, fe snafu = "0.7" tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time", "io-util"] } tracing = { version = "0.1" } -reqwest = { version = "0.11", optional = true, default-features = false, features = ["rustls-tls"] } +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } parking_lot = { version = "0.12" } # Filesystem integration url = "2.2" @@ -72,7 +72,7 @@ walkdir = "2" [features] azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest"] azure_test = ["azure", "azure_core/azurite_workaround", "azure_storage/azurite_workaround", "azure_storage_blobs/azurite_workaround"] -gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand"] +gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand", "ring"] aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "hyper-rustls"] [dev-dependencies] # In alphabetical order diff --git a/object_store/dev/release/README.md b/object_store/dev/release/README.md new file mode 100644 index 000000000000..89f6e579b23d --- /dev/null +++ b/object_store/dev/release/README.md @@ -0,0 +1,20 @@ + + +See instructions in [`/dev/release/README.md`](../../../dev/release/README.md) diff --git a/object_store/dev/release/create-tarball.sh b/object_store/dev/release/create-tarball.sh new file mode 100755 index 000000000000..bbffde89b043 --- /dev/null +++ b/object_store/dev/release/create-tarball.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# This script creates a signed tarball in +# dev/dist/apache-arrow-object-store-rs--.tar.gz and uploads it to +# the "dev" area of the dist.apache.arrow repository and prepares an +# email for sending to the dev@arrow.apache.org list for a formal +# vote. +# +# Note the tags are expected to be `object_sore_` +# +# See release/README.md for full release instructions +# +# Requirements: +# +# 1. gpg setup for signing and have uploaded your public +# signature to https://pgp.mit.edu/ +# +# 2. Logged into the apache svn server with the appropriate +# credentials +# +# +# Based in part on 02-source.sh from apache/arrow +# + +set -e + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "ex. $0 0.4.0 1" + exit +fi + +object_store_version=$1 +rc=$2 + +tag=object_store_${object_store_version} + +release=apache-arrow-object-store-rs-${object_store_version} +distdir=${SOURCE_TOP_DIR}/dev/dist/${release}-rc${rc} +tarname=${release}.tar.gz +tarball=${distdir}/${tarname} +url="https://dist.apache.org/repos/dist/dev/arrow/${release}-rc${rc}" + +echo "Attempting to create ${tarball} from tag ${tag}" + +release_hash=$(cd "${SOURCE_TOP_DIR}" && git rev-list --max-count=1 ${tag}) + +if [ -z "$release_hash" ]; then + echo "Cannot continue: unknown git tag: $tag" +fi + +echo "Draft email for dev@arrow.apache.org mailing list" +echo "" +echo "---------------------------------------------------------" +cat < containing the files in git at $release_hash +# the files in the tarball are prefixed with {object_store_version=} (e.g. 0.4.0) +mkdir -p ${distdir} +(cd "${SOURCE_TOP_DIR}" && git archive ${release_hash} --prefix ${release}/ | gzip > ${tarball}) + +echo "Running rat license checker on ${tarball}" +${SOURCE_DIR}/../../../dev/release/run-rat.sh ${tarball} + +echo "Signing tarball and creating checksums" +gpg --armor --output ${tarball}.asc --detach-sig ${tarball} +# create signing with relative path of tarball +# so that they can be verified with a command such as +# shasum --check apache-arrow-rs-4.1.0-rc2.tar.gz.sha512 +(cd ${distdir} && shasum -a 256 ${tarname}) > ${tarball}.sha256 +(cd ${distdir} && shasum -a 512 ${tarname}) > ${tarball}.sha512 + +echo "Uploading to apache dist/dev to ${url}" +svn co --depth=empty https://dist.apache.org/repos/dist/dev/arrow ${SOURCE_TOP_DIR}/dev/dist +svn add ${distdir} +svn ci -m "Apache Arrow Rust ${object_store_version=} ${rc}" ${distdir} diff --git a/object_store/dev/release/release-tarball.sh b/object_store/dev/release/release-tarball.sh new file mode 100755 index 000000000000..75ff886c6b1e --- /dev/null +++ b/object_store/dev/release/release-tarball.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# This script copies a tarball from the "dev" area of the +# dist.apache.arrow repository to the "release" area +# +# This script should only be run after the release has been approved +# by the arrow PMC committee. +# +# See release/README.md for full release instructions +# +# Based in part on post-01-upload.sh from apache/arrow + + +set -e +set -u + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "ex. $0 0.4.0 1" + exit +fi + +version=$1 +rc=$2 + +tmp_dir=tmp-apache-arrow-dist + +echo "Recreate temporary directory: ${tmp_dir}" +rm -rf ${tmp_dir} +mkdir -p ${tmp_dir} + +echo "Clone dev dist repository" +svn \ + co \ + https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-object-store-rs-${version}-rc${rc} \ + ${tmp_dir}/dev + +echo "Clone release dist repository" +svn co https://dist.apache.org/repos/dist/release/arrow ${tmp_dir}/release + +echo "Copy ${version}-rc${rc} to release working copy" +release_version=arrow-object-store-rs-${version} +mkdir -p ${tmp_dir}/release/${release_version} +cp -r ${tmp_dir}/dev/* ${tmp_dir}/release/${release_version}/ +svn add ${tmp_dir}/release/${release_version} + +echo "Commit release" +svn ci -m "Apache Arrow Rust Object Store ${version}" ${tmp_dir}/release + +echo "Clean up" +rm -rf ${tmp_dir} + +echo "Success!" +echo "The release is available here:" +echo " https://dist.apache.org/repos/dist/release/arrow/${release_version}" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh new file mode 100755 index 000000000000..ebd50df7ffc0 --- /dev/null +++ b/object_store/dev/release/update_change_log.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +# invokes the changelog generator from +# https://github.com/github-changelog-generator/github-changelog-generator +# +# With the config located in +# arrow-rs/object_store/.github_changelog_generator +# +# Usage: +# CHANGELOG_GITHUB_TOKEN= ./update_change_log.sh + +set -e + +SINCE_TAG="object_store_0.3.0" +FUTURE_RELEASE="object_store_0.4.0" + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +OUTPUT_PATH="${SOURCE_TOP_DIR}/CHANGELOG.md" + +# remove license header so github-changelog-generator has a clean base to append +sed -i.bak '1,18d' "${OUTPUT_PATH}" + +# use exclude-tags-regex to filter out tags used for arrow +# crates and only look at tags that begin with `object_store_` +pushd "${SOURCE_TOP_DIR}" +docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pwd)":/usr/local/src/your-app githubchangeloggenerator/github-changelog-generator \ + --user apache \ + --project arrow-rs \ + --cache-file=.githubchangeloggenerator.cache \ + --cache-log=.githubchangeloggenerator.cache.log \ + --http-cache \ + --max-issues=300 \ + --exclude-tags-regex "^\d+\.\d+\.\d+$" \ + --since-tag ${SINCE_TAG} \ + --future-release ${FUTURE_RELEASE} + +sed -i.bak "s/\\\n/\n\n/" "${OUTPUT_PATH}" + +# Put license header back on +echo ' +' | cat - "${OUTPUT_PATH}" > "${OUTPUT_PATH}".tmp +mv "${OUTPUT_PATH}".tmp "${OUTPUT_PATH}" diff --git a/object_store/dev/release/verify-release-candidate.sh b/object_store/dev/release/verify-release-candidate.sh new file mode 100755 index 000000000000..06a5d8bcb838 --- /dev/null +++ b/object_store/dev/release/verify-release-candidate.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +case $# in + 2) VERSION="$1" + RC_NUMBER="$2" + ;; + *) echo "Usage: $0 X.Y.Z RC_NUMBER" + exit 1 + ;; +esac + +set -e +set -x +set -o pipefail + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +ARROW_DIR="$(dirname $(dirname ${SOURCE_DIR}))" +ARROW_DIST_URL='https://dist.apache.org/repos/dist/dev/arrow' + +download_dist_file() { + curl \ + --silent \ + --show-error \ + --fail \ + --location \ + --remote-name $ARROW_DIST_URL/$1 +} + +download_rc_file() { + download_dist_file apache-arrow-object-store-rs-${VERSION}-rc${RC_NUMBER}/$1 +} + +import_gpg_keys() { + download_dist_file KEYS + gpg --import KEYS +} + +if type shasum >/dev/null 2>&1; then + sha256_verify="shasum -a 256 -c" + sha512_verify="shasum -a 512 -c" +else + sha256_verify="sha256sum -c" + sha512_verify="sha512sum -c" +fi + +fetch_archive() { + local dist_name=$1 + download_rc_file ${dist_name}.tar.gz + download_rc_file ${dist_name}.tar.gz.asc + download_rc_file ${dist_name}.tar.gz.sha256 + download_rc_file ${dist_name}.tar.gz.sha512 + gpg --verify ${dist_name}.tar.gz.asc ${dist_name}.tar.gz + ${sha256_verify} ${dist_name}.tar.gz.sha256 + ${sha512_verify} ${dist_name}.tar.gz.sha512 +} + +setup_tempdir() { + cleanup() { + if [ "${TEST_SUCCESS}" = "yes" ]; then + rm -fr "${ARROW_TMPDIR}" + else + echo "Failed to verify release candidate. See ${ARROW_TMPDIR} for details." + fi + } + + if [ -z "${ARROW_TMPDIR}" ]; then + # clean up automatically if ARROW_TMPDIR is not defined + ARROW_TMPDIR=$(mktemp -d -t "$1.XXXXX") + trap cleanup EXIT + else + # don't clean up automatically + mkdir -p "${ARROW_TMPDIR}" + fi +} + +test_source_distribution() { + # install rust toolchain in a similar fashion like test-miniconda + export RUSTUP_HOME=$PWD/test-rustup + export CARGO_HOME=$PWD/test-rustup + + curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path + + export PATH=$RUSTUP_HOME/bin:$PATH + source $RUSTUP_HOME/env + + # build and test rust + cargo build + cargo test --all + + # verify that the crate can be published to crates.io + cargo publish --dry-run +} + +TEST_SUCCESS=no + +setup_tempdir "arrow-${VERSION}" +echo "Working in sandbox ${ARROW_TMPDIR}" +cd ${ARROW_TMPDIR} + +dist_name="apache-arrow-object-store-rs-${VERSION}" +import_gpg_keys +fetch_archive ${dist_name} +tar xf ${dist_name}.tar.gz +pushd ${dist_name} +test_source_distribution +popd + +TEST_SUCCESS=yes +echo 'Release candidate looks good!' +exit 0 diff --git a/object_store/src/aws.rs b/object_store/src/aws.rs index d59f48bcefe0..bcb294c00373 100644 --- a/object_store/src/aws.rs +++ b/object_store/src/aws.rs @@ -48,6 +48,7 @@ use futures::{ Future, Stream, StreamExt, TryStreamExt, }; use hyper::client::Builder as HyperBuilder; +use percent_encoding::{percent_encode, AsciiSet, NON_ALPHANUMERIC}; use rusoto_core::ByteStream; use rusoto_credential::{InstanceMetadataProvider, StaticProvider}; use rusoto_s3::S3; @@ -62,6 +63,17 @@ use tokio::io::AsyncWrite; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tracing::{debug, warn}; +// Do not URI-encode any of the unreserved characters that RFC 3986 defines: +// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). 
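// [Editor's note: illustrative sketch, not part of this diff.] The constants introduced just
// below implement the rule described in the comment above. A minimal standalone example of how
// `percent_encode` behaves with such a set, assuming the `percent-encoding` crate that aws.rs
// already imports; the input file name is invented, and the consts are repeated here only to
// keep the snippet self-contained:
use percent_encoding::{percent_encode, AsciiSet, NON_ALPHANUMERIC};

const STRICT_ENCODE_SET: AsciiSet = NON_ALPHANUMERIC
    .remove(b'-')
    .remove(b'.')
    .remove(b'_')
    .remove(b'~');
const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/');

fn main() {
    // '/' stays intact to preserve the path hierarchy; ' ' and '%' are escaped
    let encoded = percent_encode(b"foo bar/baz%.txt", &STRICT_PATH_ENCODE_SET).to_string();
    assert_eq!(encoded, "foo%20bar/baz%25.txt");
}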
+const STRICT_ENCODE_SET: AsciiSet = NON_ALPHANUMERIC + .remove(b'-') + .remove(b'.') + .remove(b'_') + .remove(b'~'); + +/// This struct is used to maintain the URI path encoding +const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/'); + /// The maximum number of times a request will be retried in the case of an AWS server error pub const MAX_NUM_RETRIES: u32 = 3; @@ -541,9 +553,15 @@ impl ObjectStore for AmazonS3 { let to = to.as_ref(); let bucket_name = self.bucket_name.clone(); + let copy_source = format!( + "{}/{}", + &bucket_name, + percent_encode(from.as_ref(), &STRICT_PATH_ENCODE_SET) + ); + let request_factory = move || rusoto_s3::CopyObjectRequest { bucket: bucket_name.clone(), - copy_source: format!("{}/{}", &bucket_name, from), + copy_source, key: to.to_string(), ..Default::default() }; @@ -1009,34 +1027,6 @@ where } } -impl Error { - #[cfg(test)] - fn s3_error_due_to_credentials(&self) -> bool { - use rusoto_core::RusotoError; - use Error::*; - - matches!( - self, - UnableToPutData { - source: RusotoError::Credentials(_), - bucket: _, - path: _, - } | UnableToGetData { - source: RusotoError::Credentials(_), - bucket: _, - path: _, - } | UnableToDeleteData { - source: RusotoError::Credentials(_), - bucket: _, - path: _, - } | UnableToListData { - source: RusotoError::Credentials(_), - bucket: _, - } - ) - } -} - struct S3MultiPartUpload { bucket: String, key: String, @@ -1168,9 +1158,6 @@ mod tests { use bytes::Bytes; use std::env; - type TestError = Box; - type Result = std::result::Result; - const NON_EXISTENT_NAME: &str = "nonexistentname"; // Helper macro to skip tests if TEST_INTEGRATION and the AWS @@ -1250,32 +1237,16 @@ mod tests { }}; } - fn check_credentials(r: Result) -> Result { - if let Err(e) = &r { - let e = &**e; - if let Some(e) = e.downcast_ref::() { - if e.s3_error_due_to_credentials() { - eprintln!( - "Try setting the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY \ - environment variables" - ); - } - } - } - - r - } - #[tokio::test] async fn s3_test() { let config = maybe_skip_integration!(); let integration = config.build().unwrap(); - check_credentials(put_get_delete_list(&integration).await).unwrap(); - check_credentials(list_uses_directories_correctly(&integration).await).unwrap(); - check_credentials(list_with_delimiter(&integration).await).unwrap(); - check_credentials(rename_and_copy(&integration).await).unwrap(); - check_credentials(stream_get(&integration).await).unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + stream_get(&integration).await; } #[tokio::test] diff --git a/object_store/src/azure.rs b/object_store/src/azure.rs index 0d5f2fa7d305..6a5f537997cd 100644 --- a/object_store/src/azure.rs +++ b/object_store/src/azure.rs @@ -470,14 +470,15 @@ impl ObjectStore for MicrosoftAzure { impl MicrosoftAzure { /// helper function to create a source url for copy function - fn get_copy_from_url(&self, from: &Path) -> Result { - Ok(reqwest::Url::parse(&format!( - "{}/{}/{}", - &self.blob_base_url, self.container_name, from - )) - .context(UnableToParseUrlSnafu { - container: &self.container_name, - })?) 
+ fn get_copy_from_url(&self, from: &Path) -> Result { + let mut url = + Url::parse(&format!("{}/{}", &self.blob_base_url, self.container_name)) + .context(UnableToParseUrlSnafu { + container: &self.container_name, + })?; + + url.path_segments_mut().unwrap().extend(from.parts()); + Ok(url) } async fn list_impl( @@ -857,10 +858,10 @@ mod tests { async fn azure_blob_test() { let integration = maybe_skip_integration!().build().unwrap(); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); - copy_if_not_exists(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; } } diff --git a/object_store/src/gcp.rs b/object_store/src/gcp.rs index f9cb2b207a68..0dc5a956ac08 100644 --- a/object_store/src/gcp.rs +++ b/object_store/src/gcp.rs @@ -1003,14 +1003,14 @@ mod test { async fn gcs_test() { let integration = maybe_skip_integration!().build().unwrap(); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; if integration.client.base_url == default_gcs_base_url() { // Fake GCS server does not yet implement XML Multipart uploads // https://github.com/fsouza/fake-gcs-server/issues/852 - stream_get(&integration).await.unwrap(); + stream_get(&integration).await; } } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 08634e2feaf7..f7adedb2682c 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -176,7 +176,9 @@ mod multipart; mod util; use crate::path::Path; -use crate::util::{collect_bytes, maybe_spawn_blocking}; +use crate::util::{ + coalesce_ranges, collect_bytes, maybe_spawn_blocking, OBJECT_STORE_COALESCE_DEFAULT, +}; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; @@ -231,6 +233,21 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// in the given byte range async fn get_range(&self, location: &Path, range: Range) -> Result; + /// Return the bytes that are stored at the specified location + /// in the given byte ranges + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + coalesce_ranges( + ranges, + |range| self.get_range(location, range), + OBJECT_STORE_COALESCE_DEFAULT, + ) + .await + } + /// Return the metadata for the specified location async fn head(&self, location: &Path) -> Result; @@ -296,7 +313,7 @@ pub struct ListResult { } /// The metadata that describes an object. 
-#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct ObjectMeta { /// The full path to the object pub location: Path, @@ -478,15 +495,12 @@ mod tests { use crate::test_util::flatten_list_stream; use tokio::io::AsyncWriteExt; - type Error = Box; - type Result = std::result::Result; - - pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { let store_str = storage.to_string(); delete_fixtures(storage).await; - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!( content_list.is_empty(), "Expected list to be empty; found: {:?}", @@ -497,16 +511,16 @@ mod tests { let data = Bytes::from("arbitrary data"); let expected_data = data.clone(); - storage.put(&location, data).await?; + storage.put(&location, data).await.unwrap(); let root = Path::from("/"); // List everything - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert_eq!(content_list, &[location.clone()]); // Should behave the same as no prefix - let content_list = flatten_list_stream(storage, Some(&root)).await?; + let content_list = flatten_list_stream(storage, Some(&root)).await.unwrap(); assert_eq!(content_list, &[location.clone()]); // List with delimiter @@ -523,15 +537,15 @@ mod tests { // List everything starting with a prefix that should return results let prefix = Path::from("test_dir"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert_eq!(content_list, &[location.clone()]); // List everything starting with a prefix that shouldn't return results let prefix = Path::from("something"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert!(content_list.is_empty()); - let read_data = storage.get(&location).await?.bytes().await?; + let read_data = storage.get(&location).await.unwrap().bytes().await.unwrap(); assert_eq!(&*read_data, expected_data); // Test range request @@ -555,14 +569,20 @@ mod tests { // Should be a non-fatal error out_of_range_result.unwrap_err(); + + let ranges = vec![0..1, 2..3, 0..5]; + let bytes = storage.get_ranges(&location, &ranges).await.unwrap(); + for (range, bytes) in ranges.iter().zip(bytes) { + assert_eq!(bytes, expected_data.slice(range.clone())) + } } - let head = storage.head(&location).await?; + let head = storage.head(&location).await.unwrap(); assert_eq!(head.size, expected_data.len()); - storage.delete(&location).await?; + storage.delete(&location).await.unwrap(); - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!(content_list.is_empty()); let err = storage.get(&location).await.unwrap_err(); @@ -635,13 +655,54 @@ mod tests { assert_eq!(files, vec![emoji_file.clone()]); + let dst = Path::from("foo.parquet"); + storage.copy(&emoji_file, &dst).await.unwrap(); + let mut files = flatten_list_stream(storage, None).await.unwrap(); + files.sort_unstable(); + assert_eq!(files, vec![emoji_file.clone(), dst.clone()]); + storage.delete(&emoji_file).await.unwrap(); + storage.delete(&dst).await.unwrap(); let files = flatten_list_stream(storage, Some(&emoji_prefix)) .await .unwrap(); 
assert!(files.is_empty()); - Ok(()) + // Test handling of paths containing percent-encoded sequences + + // "HELLO" percent encoded + let hello_prefix = Path::parse("%48%45%4C%4C%4F").unwrap(); + let path = hello_prefix.child("foo.parquet"); + + storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); + let files = flatten_list_stream(storage, Some(&hello_prefix)) + .await + .unwrap(); + assert_eq!(files, vec![path.clone()]); + + // Cannot list by decoded representation + let files = flatten_list_stream(storage, Some(&Path::from("HELLO"))) + .await + .unwrap(); + assert!(files.is_empty()); + + // Cannot access by decoded representation + let err = storage + .head(&Path::from("HELLO/foo.parquet")) + .await + .unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + + storage.delete(&path).await.unwrap(); + + // Can also write non-percent encoded sequences + let path = Path::parse("%Q.parquet").unwrap(); + storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); + + let files = flatten_list_stream(storage, None).await.unwrap(); + assert_eq!(files, vec![path.clone()]); + + storage.delete(&path).await.unwrap(); } fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { @@ -650,15 +711,15 @@ mod tests { .collect() } - pub(crate) async fn stream_get(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn stream_get(storage: &DynObjectStore) { let location = Path::from("test_dir/test_upload_file.txt"); // Can write to storage let data = get_vec_of_bytes(5_000_000, 10); let bytes_expected = data.concat(); - let (_, mut writer) = storage.put_multipart(&location).await?; + let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { - writer.write_all(chunk).await?; + writer.write_all(chunk).await.unwrap(); } // Object should not yet exist in store @@ -669,26 +730,29 @@ mod tests { crate::Error::NotFound { .. 
} )); - writer.shutdown().await?; - let bytes_written = storage.get(&location).await?.bytes().await?; + writer.shutdown().await.unwrap(); + let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); assert_eq!(bytes_expected, bytes_written); // Can overwrite some storage let data = get_vec_of_bytes(5_000, 5); let bytes_expected = data.concat(); - let (_, mut writer) = storage.put_multipart(&location).await?; + let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { - writer.write_all(chunk).await?; + writer.write_all(chunk).await.unwrap(); } - writer.shutdown().await?; - let bytes_written = storage.get(&location).await?.bytes().await?; + writer.shutdown().await.unwrap(); + let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); assert_eq!(bytes_expected, bytes_written); // We can abort an empty write let location = Path::from("test_dir/test_abort_upload.txt"); - let (upload_id, writer) = storage.put_multipart(&location).await?; + let (upload_id, writer) = storage.put_multipart(&location).await.unwrap(); drop(writer); - storage.abort_multipart(&location, &upload_id).await?; + storage + .abort_multipart(&location, &upload_id) + .await + .unwrap(); let get_res = storage.get(&location).await; assert!(get_res.is_err()); assert!(matches!( @@ -697,30 +761,29 @@ mod tests { )); // We can abort an in-progress write - let (upload_id, mut writer) = storage.put_multipart(&location).await?; + let (upload_id, mut writer) = storage.put_multipart(&location).await.unwrap(); if let Some(chunk) = data.get(0) { - writer.write_all(chunk).await?; - let _ = writer.write(chunk).await?; + writer.write_all(chunk).await.unwrap(); + let _ = writer.write(chunk).await.unwrap(); } drop(writer); - storage.abort_multipart(&location, &upload_id).await?; + storage + .abort_multipart(&location, &upload_id) + .await + .unwrap(); let get_res = storage.get(&location).await; assert!(get_res.is_err()); assert!(matches!( get_res.unwrap_err(), crate::Error::NotFound { .. 
} )); - - Ok(()) } - pub(crate) async fn list_uses_directories_correctly( - storage: &DynObjectStore, - ) -> Result<()> { + pub(crate) async fn list_uses_directories_correctly(storage: &DynObjectStore) { delete_fixtures(storage).await; - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!( content_list.is_empty(), "Expected list to be empty; found: {:?}", @@ -731,25 +794,23 @@ mod tests { let location2 = Path::from("foo.bar/y.json"); let data = Bytes::from("arbitrary data"); - storage.put(&location1, data.clone()).await?; - storage.put(&location2, data).await?; + storage.put(&location1, data.clone()).await.unwrap(); + storage.put(&location2, data).await.unwrap(); let prefix = Path::from("foo"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert_eq!(content_list, &[location1.clone()]); let prefix = Path::from("foo/x"); - let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert_eq!(content_list, &[]); - - Ok(()) } - pub(crate) async fn list_with_delimiter(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn list_with_delimiter(storage: &DynObjectStore) { delete_fixtures(storage).await; // ==================== check: store is empty ==================== - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!(content_list.is_empty()); // ==================== do: create files ==================== @@ -811,10 +872,8 @@ mod tests { } // ==================== check: store is empty ==================== - let content_list = flatten_list_stream(storage, None).await?; + let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!(content_list.is_empty()); - - Ok(()) } pub(crate) async fn get_nonexistent_object( @@ -830,7 +889,7 @@ mod tests { storage.get(&location).await?.bytes().await } - pub(crate) async fn rename_and_copy(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn rename_and_copy(storage: &DynObjectStore) { // Create two objects let path1 = Path::from("test1"); let path2 = Path::from("test2"); @@ -838,29 +897,27 @@ mod tests { let contents2 = Bytes::from("dogs"); // copy() make both objects identical - storage.put(&path1, contents1.clone()).await?; - storage.put(&path2, contents2.clone()).await?; - storage.copy(&path1, &path2).await?; - let new_contents = storage.get(&path2).await?.bytes().await?; + storage.put(&path1, contents1.clone()).await.unwrap(); + storage.put(&path2, contents2.clone()).await.unwrap(); + storage.copy(&path1, &path2).await.unwrap(); + let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); assert_eq!(&new_contents, &contents1); // rename() copies contents and deletes original - storage.put(&path1, contents1.clone()).await?; - storage.put(&path2, contents2.clone()).await?; - storage.rename(&path1, &path2).await?; - let new_contents = storage.get(&path2).await?.bytes().await?; + storage.put(&path1, contents1.clone()).await.unwrap(); + storage.put(&path2, contents2.clone()).await.unwrap(); + storage.rename(&path1, &path2).await.unwrap(); + let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); assert_eq!(&new_contents, &contents1); let result = storage.get(&path1).await; assert!(result.is_err()); 
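// [Editor's note: illustrative sketch, not part of this diff.] How a caller might use the new
// `ObjectStore::get_ranges` API exercised by the updated tests in this module, shown here
// against the `InMemory` store and assuming a Tokio runtime; the location and data are
// invented for illustration.
use bytes::Bytes;
use object_store::{memory::InMemory, path::Path, ObjectStore};

#[tokio::main]
async fn main() -> object_store::Result<()> {
    let store = InMemory::new();
    let location = Path::from("data/blob.bin");
    store.put(&location, Bytes::from(vec![0u8; 4096])).await?;

    // One result is returned per requested range; nearby ranges may be coalesced into
    // fewer underlying reads by the default trait implementation
    let ranges = vec![0..16, 20..64, 2048..2100];
    let chunks = store.get_ranges(&location, &ranges).await?;
    assert_eq!(chunks.len(), ranges.len());
    assert_eq!(chunks[1].len(), 44);
    Ok(())
}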
assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); // Clean up - storage.delete(&path2).await?; - - Ok(()) + storage.delete(&path2).await.unwrap(); } - pub(crate) async fn copy_if_not_exists(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn copy_if_not_exists(storage: &DynObjectStore) { // Create two objects let path1 = Path::from("test1"); let path2 = Path::from("test2"); @@ -868,8 +925,8 @@ mod tests { let contents2 = Bytes::from("dogs"); // copy_if_not_exists() errors if destination already exists - storage.put(&path1, contents1.clone()).await?; - storage.put(&path2, contents2.clone()).await?; + storage.put(&path1, contents1.clone()).await.unwrap(); + storage.put(&path2, contents2.clone()).await.unwrap(); let result = storage.copy_if_not_exists(&path1, &path2).await; assert!(result.is_err()); assert!(matches!( @@ -878,19 +935,17 @@ mod tests { )); // copy_if_not_exists() copies contents and allows deleting original - storage.delete(&path2).await?; - storage.copy_if_not_exists(&path1, &path2).await?; - storage.delete(&path1).await?; - let new_contents = storage.get(&path2).await?.bytes().await?; + storage.delete(&path2).await.unwrap(); + storage.copy_if_not_exists(&path1, &path2).await.unwrap(); + storage.delete(&path1).await.unwrap(); + let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); assert_eq!(&new_contents, &contents1); let result = storage.get(&path1).await; assert!(result.is_err()); assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); // Clean up - storage.delete(&path2).await?; - - Ok(()) + storage.delete(&path2).await.unwrap(); } async fn delete_fixtures(storage: &DynObjectStore) { diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index fd21ccb58d7b..09c88aa2a4bc 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -110,6 +110,15 @@ impl ObjectStore for LimitStore { self.inner.get_range(location, range).await } + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.get_ranges(location, ranges).await + } + async fn head(&self, location: &Path) -> Result { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.head(location).await @@ -237,11 +246,11 @@ mod tests { let memory = InMemory::new(); let integration = LimitStore::new(memory, max_requests); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); - stream_get(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + stream_get(&integration).await; let mut streams = Vec::with_capacity(max_requests); for _ in 0..max_requests { diff --git a/object_store/src/local.rs b/object_store/src/local.rs index c3f54e0c6dc1..fd3c3592ab56 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -18,7 +18,7 @@ //! 
An object store implementation for a local filesystem use crate::{ maybe_spawn_blocking, - path::{filesystem_path_to_url, Path}, + path::{absolute_path_to_url, Path}, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, }; use async_trait::async_trait; @@ -129,6 +129,12 @@ pub(crate) enum Error { path: String, source: io::Error, }, + + #[snafu(display("Unable to canonicalize filesystem root: {}", path.display()))] + UnableToCanonicalize { + path: PathBuf, + source: io::Error, + }, } impl From for super::Error { @@ -214,17 +220,24 @@ impl LocalFileSystem { } /// Create new filesystem storage with `prefix` applied to all paths + /// + /// Returns an error if the path does not exist + /// pub fn new_with_prefix(prefix: impl AsRef) -> Result { + let path = std::fs::canonicalize(&prefix).context(UnableToCanonicalizeSnafu { + path: prefix.as_ref(), + })?; + Ok(Self { config: Arc::new(Config { - root: filesystem_path_to_url(prefix)?, + root: absolute_path_to_url(path)?, }), }) } } impl Config { - /// Return filesystem path of the given location + /// Return an absolute filesystem path of the given location fn path_to_filesystem(&self, location: &Path) -> Result { let mut url = self.root.clone(); url.path_segments_mut() @@ -238,8 +251,9 @@ impl Config { .map_err(|_| Error::InvalidUrl { url }.into()) } + /// Resolves the provided absolute filesystem path to a [`Path`] prefix fn filesystem_to_path(&self, location: &std::path::Path) -> Result { - Ok(Path::from_filesystem_path_with_base( + Ok(Path::from_absolute_path_with_base( location, Some(&self.root), )?) @@ -322,26 +336,25 @@ impl ObjectStore for LocalFileSystem { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { let mut file = open_file(&path)?; - let to_read = range.end - range.start; - file.seek(SeekFrom::Start(range.start as u64)) - .context(SeekSnafu { path: &path })?; - - let mut buf = Vec::with_capacity(to_read); - let read = file - .take(to_read as u64) - .read_to_end(&mut buf) - .context(UnableToReadBytesSnafu { path: &path })?; - - ensure!( - read == to_read, - OutOfRangeSnafu { - path: &path, - expected: to_read, - actual: read - } - ); + read_range(&mut file, &path, range) + }) + .await + } - Ok(buf.into()) + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + let path = self.config.path_to_filesystem(location)?; + let ranges = ranges.to_vec(); + maybe_spawn_blocking(move || { + // Vectored IO might be faster + let mut file = open_file(&path)?; + ranges + .into_iter() + .map(|r| read_range(&mut file, &path, r)) + .collect() }) .await } @@ -750,6 +763,28 @@ impl AsyncWrite for LocalUpload { } } +fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { + let to_read = range.end - range.start; + file.seek(SeekFrom::Start(range.start as u64)) + .context(SeekSnafu { path })?; + + let mut buf = Vec::with_capacity(to_read); + let read = file + .take(to_read as u64) + .read_to_end(&mut buf) + .context(UnableToReadBytesSnafu { path })?; + + ensure!( + read == to_read, + OutOfRangeSnafu { + path, + expected: to_read, + actual: read + } + ); + Ok(buf.into()) +} + fn open_file(path: &PathBuf) -> Result { let file = File::open(path).map_err(|e| { if e.kind() == std::io::ErrorKind::NotFound { @@ -888,12 +923,12 @@ mod tests { let root = TempDir::new().unwrap(); let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); 
- list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); - copy_if_not_exists(&integration).await.unwrap(); - stream_get(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; } #[test] @@ -901,10 +936,10 @@ mod tests { let root = TempDir::new().unwrap(); let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); futures::executor::block_on(async move { - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - stream_get(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + stream_get(&integration).await; }); } @@ -1212,7 +1247,7 @@ mod tests { .to_string(); assert!( - err.contains("Invalid path segment - got \"💀\" expected: \"%F0%9F%92%80\""), + err.contains("Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\""), "{}", err ); @@ -1247,4 +1282,33 @@ mod tests { 0 ); } + + #[tokio::test] + async fn filesystem_filename_with_percent() { + let temp_dir = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(temp_dir.path()).unwrap(); + let filename = "L%3ABC.parquet"; + + std::fs::write(temp_dir.path().join(filename), "foo").unwrap(); + + let list_stream = integration.list(None).await.unwrap(); + let res: Vec<_> = list_stream.try_collect().await.unwrap(); + assert_eq!(res.len(), 1); + assert_eq!(res[0].location.as_ref(), filename); + + let res = integration.list_with_delimiter(None).await.unwrap(); + assert_eq!(res.objects.len(), 1); + assert_eq!(res.objects[0].location.as_ref(), filename); + } + + #[tokio::test] + async fn relative_paths() { + LocalFileSystem::new_with_prefix(".").unwrap(); + LocalFileSystem::new_with_prefix("..").unwrap(); + LocalFileSystem::new_with_prefix("../..").unwrap(); + + let integration = LocalFileSystem::new(); + let path = Path::from_filesystem_path(".").unwrap(); + integration.list_with_delimiter(Some(&path)).await.unwrap(); + } } diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index dc3967d9915f..e4be5b2afddf 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -128,6 +128,22 @@ impl ObjectStore for InMemory { Ok(data.slice(range)) } + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + let data = self.get_bytes(location).await?; + ranges + .iter() + .map(|range| { + ensure!(range.end <= data.len(), OutOfRangeSnafu); + ensure!(range.start <= range.end, BadRangeSnafu); + Ok(data.slice(range.clone())) + }) + .collect() + } + async fn head(&self, location: &Path) -> Result { let last_modified = Utc::now(); let bytes = self.get_bytes(location).await?; @@ -305,12 +321,12 @@ mod tests { async fn in_memory_test() { let integration = InMemory::new(); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); - copy_if_not_exists(&integration).await.unwrap(); - stream_get(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + 
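// [Editor's note: illustrative sketch, not part of this diff.] The user-visible effect of the
// canonicalization added to `LocalFileSystem::new_with_prefix`, as exercised by the
// `relative_paths` test above: relative prefixes such as "." are now accepted and resolved to
// an absolute path (the prefix must exist). Assumes a Tokio runtime.
use object_store::{local::LocalFileSystem, ObjectStore};

#[tokio::main]
async fn main() -> object_store::Result<()> {
    let store = LocalFileSystem::new_with_prefix(".")?;
    let listing = store.list_with_delimiter(None).await?;
    println!("{} objects at the store root", listing.objects.len());
    Ok(())
}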
list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; } #[tokio::test] diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index 38b7eb3e0273..e5a7b6443bb1 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -126,7 +126,6 @@ pub enum Error { /// Path::parse("..").unwrap_err(); /// Path::parse("/foo//").unwrap_err(); /// Path::parse("😀").unwrap_err(); -/// Path::parse("%Q").unwrap_err(); /// ``` /// /// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt @@ -163,23 +162,38 @@ impl Path { /// Convert a filesystem path to a [`Path`] relative to the filesystem root /// - /// This will return an error if the path contains illegal - /// character sequences as defined by [`Path::parse`] + /// This will return an error if the path contains illegal character sequences + /// as defined by [`Path::parse`] or does not exist + /// + /// Note: this will canonicalize the provided path, resolving any symlinks pub fn from_filesystem_path( path: impl AsRef, ) -> Result { - Self::from_filesystem_path_with_base(path, None) + let absolute = std::fs::canonicalize(&path).context(CanonicalizeSnafu { + path: path.as_ref(), + })?; + + Self::from_absolute_path(absolute) + } + + /// Convert an absolute filesystem path to a [`Path`] relative to the filesystem root + /// + /// This will return an error if the path contains illegal character sequences + /// as defined by [`Path::parse`], or `base` is not an absolute path + pub fn from_absolute_path(path: impl AsRef) -> Result { + Self::from_absolute_path_with_base(path, None) } /// Convert a filesystem path to a [`Path`] relative to the provided base /// /// This will return an error if the path contains illegal character sequences - /// as defined by [`Path::parse`], or `base` does not refer to a parent path of `path` - pub(crate) fn from_filesystem_path_with_base( + /// as defined by [`Path::parse`], or `base` does not refer to a parent path of `path`, + /// or `base` is not an absolute path + pub(crate) fn from_absolute_path_with_base( path: impl AsRef, base: Option<&Url>, ) -> Result { - let url = filesystem_path_to_url(path)?; + let url = absolute_path_to_url(path)?; let path = match base { Some(prefix) => url.path().strip_prefix(prefix.path()).ok_or_else(|| { Error::PrefixMismatch { @@ -294,8 +308,8 @@ where } } -/// Given a filesystem path convert it to a URL representation -pub(crate) fn filesystem_path_to_url( +/// Given an absolute filesystem path convert it to a URL representation without canonicalization +pub(crate) fn absolute_path_to_url( path: impl AsRef, ) -> Result { Url::from_file_path(&path).map_err(|_| Error::InvalidPath { diff --git a/object_store/src/path/parts.rs b/object_store/src/path/parts.rs index e73b184fc15c..9da4815712db 100644 --- a/object_store/src/path/parts.rs +++ b/object_store/src/path/parts.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
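// [Editor's note: illustrative sketch, not part of this diff.] The relaxed path validation in
// this change (path/mod.rs and path/parts.rs), from a user's perspective: arbitrary percent
// sequences are now accepted verbatim, while traversal and non-ASCII segments are still
// rejected. The file names come from the tests and doc examples in this change.
use object_store::path::Path;

fn main() {
    Path::parse("%Q.parquet").unwrap(); // previously an error, now allowed
    Path::parse("L%3ABC.parquet").unwrap(); // percent-encoded ':' is preserved as-is
    Path::parse("..").unwrap_err(); // traversal still rejected
    Path::parse("😀").unwrap_err(); // non-ASCII still rejected
}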
-use percent_encoding::{percent_decode, percent_encode, AsciiSet, CONTROLS}; +use percent_encoding::{percent_encode, AsciiSet, CONTROLS}; use std::borrow::Cow; use crate::path::DELIMITER_BYTE; @@ -23,11 +23,15 @@ use snafu::Snafu; /// Error returned by [`PathPart::parse`] #[derive(Debug, Snafu)] -#[snafu(display("Invalid path segment - got \"{}\" expected: \"{}\"", actual, expected))] +#[snafu(display( + "Encountered illegal character sequence \"{}\" whilst parsing path segment \"{}\"", + illegal, + segment +))] #[allow(missing_copy_implementations)] pub struct InvalidPart { - actual: String, - expected: String, + segment: String, + illegal: String, } /// The PathPart type exists to validate the directory/file names that form part @@ -43,21 +47,40 @@ pub struct PathPart<'a> { impl<'a> PathPart<'a> { /// Parse the provided path segment as a [`PathPart`] returning an error if invalid pub fn parse(segment: &'a str) -> Result { - let decoded: Cow<'a, [u8]> = percent_decode(segment.as_bytes()).into(); - let part = PathPart::from(decoded.as_ref()); - if segment != part.as_ref() { + if segment == "." || segment == ".." { return Err(InvalidPart { - actual: segment.to_string(), - expected: part.raw.to_string(), + segment: segment.to_string(), + illegal: segment.to_string(), }); } + for (idx, b) in segment.as_bytes().iter().cloned().enumerate() { + // A percent character is always valid, even if not + // followed by a valid 2-digit hex code + // https://url.spec.whatwg.org/#percent-encoded-bytes + if b == b'%' { + continue; + } + + if !b.is_ascii() || should_percent_encode(b) { + return Err(InvalidPart { + segment: segment.to_string(), + // This is correct as only single byte characters up to this point + illegal: segment.chars().nth(idx).unwrap().to_string(), + }); + } + } + Ok(Self { raw: segment.into(), }) } } +fn should_percent_encode(c: u8) -> bool { + percent_encode(&[c], INVALID).next().unwrap().len() != 1 +} + /// Characters we want to encode. 
const INVALID: &AsciiSet = &CONTROLS // The delimiter we are reserving for internal hierarchy @@ -145,4 +168,18 @@ mod tests { let part: PathPart<'_> = "..".into(); assert_eq!(part.raw, "%2E%2E"); } + + #[test] + fn path_part_parse() { + PathPart::parse("foo").unwrap(); + PathPart::parse("foo/bar").unwrap_err(); + + // Test percent-encoded path + PathPart::parse("foo%2Fbar").unwrap(); + PathPart::parse("L%3ABC.parquet").unwrap(); + + // Test path containing bad escape sequence + PathPart::parse("%Z").unwrap(); + PathPart::parse("%%").unwrap(); + } } diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index 6789f0e68df9..90f427cc2651 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -197,7 +197,7 @@ impl ObjectStore for ThrottledStore { async fn get_range(&self, location: &Path, range: Range) -> Result { let config = self.config(); - let sleep_duration = config.wait_delete_per_call + let sleep_duration = config.wait_get_per_call + config.wait_get_per_byte * (range.end - range.start) as u32; sleep(sleep_duration).await; @@ -205,6 +205,22 @@ impl ObjectStore for ThrottledStore { self.inner.get_range(location, range).await } + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + let config = self.config(); + + let total_bytes: usize = ranges.iter().map(|range| range.end - range.start).sum(); + let sleep_duration = + config.wait_get_per_call + config.wait_get_per_byte * total_bytes as u32; + + sleep(sleep_duration).await; + + self.inner.get_ranges(location, ranges).await + } + async fn head(&self, location: &Path) -> Result { sleep(self.config().wait_put_per_call).await; self.inner.head(location).await @@ -260,11 +276,23 @@ impl ObjectStore for ThrottledStore { self.inner.copy(from, to).await } + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.rename(from, to).await + } + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { sleep(self.config().wait_put_per_call).await; self.inner.copy_if_not_exists(from, to).await } + + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.rename_if_not_exists(from, to).await + } } /// Saturated `usize` to `u32` cast. @@ -308,11 +336,11 @@ mod tests { let inner = InMemory::new(); let store = ThrottledStore::new(inner, ThrottleConfig::default()); - put_get_delete_list(&store).await.unwrap(); - list_uses_directories_correctly(&store).await.unwrap(); - list_with_delimiter(&store).await.unwrap(); - rename_and_copy(&store).await.unwrap(); - copy_if_not_exists(&store).await.unwrap(); + put_get_delete_list(&store).await; + list_uses_directories_correctly(&store).await; + list_with_delimiter(&store).await; + rename_and_copy(&store).await; + copy_if_not_exists(&store).await; } #[tokio::test] diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 4f3ed86fdc69..46e9e9ed8771 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -71,3 +71,98 @@ where Err(_) => f(), } } + +/// Range requests with a gap less than or equal to this, +/// will be coalesced into a single request by [`coalesce_ranges`] +pub const OBJECT_STORE_COALESCE_DEFAULT: usize = 1024 * 1024; + +/// Takes a function to fetch ranges and coalesces adjacent ranges if they are +/// less than `coalesce` bytes apart. 
Out of order `ranges` are not coalesced +pub async fn coalesce_ranges( + ranges: &[std::ops::Range], + mut fetch: F, + coalesce: usize, +) -> Result> +where + F: Send + FnMut(std::ops::Range) -> Fut, + Fut: std::future::Future> + Send, +{ + let mut ret = Vec::with_capacity(ranges.len()); + let mut start_idx = 0; + let mut end_idx = 1; + + while start_idx != ranges.len() { + while end_idx != ranges.len() + && ranges[end_idx] + .start + .checked_sub(ranges[start_idx].end) + .map(|delta| delta <= coalesce) + .unwrap_or(false) + { + end_idx += 1; + } + + let start = ranges[start_idx].start; + let end = ranges[end_idx - 1].end; + let bytes = fetch(start..end).await?; + for range in ranges.iter().take(end_idx).skip(start_idx) { + ret.push(bytes.slice(range.start - start..range.end - start)) + } + start_idx = end_idx; + end_idx += 1; + } + Ok(ret) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ops::Range; + + #[tokio::test] + async fn test_coalesce_ranges() { + let do_fetch = |ranges: Vec>, coalesce: usize| async move { + let max = ranges.iter().map(|x| x.end).max().unwrap_or(0); + let src: Vec<_> = (0..max).map(|x| x as u8).collect(); + + let mut fetches = vec![]; + let coalesced = coalesce_ranges( + &ranges, + |range| { + fetches.push(range.clone()); + futures::future::ready(Ok(Bytes::from(src[range].to_vec()))) + }, + coalesce, + ) + .await + .unwrap(); + + assert_eq!(ranges.len(), coalesced.len()); + for (range, bytes) in ranges.iter().zip(coalesced) { + assert_eq!(bytes.as_ref(), &src[range.clone()]); + } + fetches + }; + + let fetches = do_fetch(vec![], 0).await; + assert_eq!(fetches, vec![]); + + let fetches = do_fetch(vec![0..3], 0).await; + assert_eq!(fetches, vec![0..3]); + + let fetches = do_fetch(vec![0..2, 3..5], 0).await; + assert_eq!(fetches, vec![0..2, 3..5]); + + let fetches = do_fetch(vec![0..1, 1..2], 0).await; + assert_eq!(fetches, vec![0..2]); + + let fetches = do_fetch(vec![0..1, 2..72], 1).await; + assert_eq!(fetches, vec![0..72]); + + let fetches = do_fetch(vec![0..1, 56..72, 73..75], 1).await; + assert_eq!(fetches, vec![0..1, 56..75]); + + let fetches = do_fetch(vec![0..1, 5..6, 7..9, 2..3, 4..6], 1).await; + assert_eq!(fetches, vec![0..1, 5..9, 2..6]); + } +} diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 0324ecccc515..5a8e4c485328 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "19.0.0" +version = "20.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -30,7 +30,7 @@ edition = "2021" rust-version = "1.62" [dependencies] -ahash = "0.7" +ahash = "0.8" parquet-format = { version = "4.0.0", default-features = false } bytes = { version = "1.1", default-features = false, features = ["std"] } byteorder = { version = "1", default-features = false } @@ -43,7 +43,7 @@ zstd = { version = "0.11.1", optional = true, default-features = false } chrono = { version = "0.4", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } -arrow = { path = "../arrow", version = "19.0.0", optional = true, default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "20.0.0", optional = true, default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } clap = { version = "3", default-features = false, features = ["std", 
"derive", "env"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } @@ -63,7 +63,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.11", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "19.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint"] } +arrow = { path = "../arrow", version = "20.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint"] } [package.metadata.docs.rs] all-features = true diff --git a/parquet/README.md b/parquet/README.md index fbb6e3e1b5d5..689a664b6326 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -19,17 +19,38 @@ # Apache Parquet Official Native Rust Implementation -[![Crates.io](https://img.shields.io/crates/v/parquet.svg)](https://crates.io/crates/parquet) +[![crates.io](https://img.shields.io/crates/v/parquet.svg)](https://crates.io/crates/parquet) +[![docs.rs](https://img.shields.io/docsrs/parquet.svg)](https://docs.rs/parquet/latest/parquet/) This crate contains the official Native Rust implementation of [Apache Parquet](https://parquet.apache.org/), which is part of the [Apache Arrow](https://arrow.apache.org/) project. See [crate documentation](https://docs.rs/parquet/latest/parquet/) for examples and the full API. -## Rust Version Compatbility +## Rust Version Compatibility This crate is tested with the latest stable version of Rust. We do not currently test against other, older versions of the Rust compiler. -## Features +## Versioning / Releases + +The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. + +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `19.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. 
+ +## Feature Flags + +The `parquet` crate provides the following features which may be enabled in your `Cargo.toml`: + +- `arrow` (default) - support for reading / writing [`arrow`](https://crates.io/crates/arrow) arrays to / from parquet +- `async` - support `async` APIs for reading parquet +- `json` - support for reading / writing `json` data to / from parquet +- `brotli` (default) - support for parquet using `brotli` compression +- `flate2` (default) - support for parquet using `gzip` compression +- `lz4` (default) - support for parquet using `lz4` compression +- `zstd` (default) - support for parquet using `zstd` compression +- `cli` - parquet [CLI tools](https://github.com/apache/arrow-rs/tree/master/parquet/src/bin) +- `experimental` - Experimental APIs which may change, even between minor releases + +## Parquet Feature Status - [x] All encodings supported - [x] All compression codecs supported diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index dc2ed8355659..a3c904505c25 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -300,6 +300,26 @@ fn bench_array_reader(mut array_reader: Box) -> usize { total_count } +fn bench_array_reader_skip(mut array_reader: Box) -> usize { + // test procedure: read data in batches of 8192 until no more data + let mut total_count = 0; + let mut skip = false; + let mut array_len; + loop { + if skip { + array_len = array_reader.skip_records(BATCH_SIZE).unwrap(); + } else { + let array = array_reader.next_batch(BATCH_SIZE); + array_len = array.unwrap().len(); + } + total_count += array_len; + skip = !skip; + if array_len < BATCH_SIZE { + break; + } + } + total_count +} fn create_primitive_array_reader( page_iterator: impl PageIterator + 'static, column_desc: ColumnDescPtr, @@ -445,6 +465,39 @@ fn bench_primitive( assert_eq!(count, EXPECTED_VALUE_COUNT); }); + // binary packed skip , no NULLs + let data = build_encoded_primitive_page_iterator::( + schema.clone(), + mandatory_column_desc.clone(), + 0.0, + Encoding::DELTA_BINARY_PACKED, + ); + group.bench_function("binary packed skip, mandatory, no NULLs", |b| { + b.iter(|| { + let array_reader = create_primitive_array_reader( + data.clone(), + mandatory_column_desc.clone(), + ); + count = bench_array_reader_skip(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + let data = build_encoded_primitive_page_iterator::( + schema.clone(), + optional_column_desc.clone(), + 0.0, + Encoding::DELTA_BINARY_PACKED, + ); + group.bench_function("binary packed skip, optional, no NULLs", |b| { + b.iter(|| { + let array_reader = + create_primitive_array_reader(data.clone(), optional_column_desc.clone()); + count = bench_array_reader_skip(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + // binary packed, half NULLs let data = build_encoded_primitive_page_iterator::( schema.clone(), diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index 25ff1ca90dc6..ddca1e53c6de 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -92,6 +92,25 @@ fn create_string_bench_batch( )?) } +fn create_string_dictionary_bench_batch( + size: usize, + null_density: f32, + true_density: f32, +) -> Result { + let fields = vec![Field::new( + "_1", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + )]; + let schema = Schema::new(fields); + Ok(create_random_batch( + Arc::new(schema), + size, + null_density, + true_density, + )?) 
+} + fn create_string_bench_batch_non_null( size: usize, null_density: f32, @@ -346,6 +365,18 @@ fn bench_primitive_writer(c: &mut Criterion) { b.iter(|| write_batch(&batch).unwrap()) }); + let batch = create_string_dictionary_bench_batch(4096, 0.25, 0.75).unwrap(); + group.throughput(Throughput::Bytes( + batch + .columns() + .iter() + .map(|f| f.get_array_memory_size() as u64) + .sum(), + )); + group.bench_function("4096 values string dictionary", |b| { + b.iter(|| write_batch(&batch).unwrap()) + }); + let batch = create_string_bench_batch_non_null(4096, 0.25, 0.75).unwrap(); group.throughput(Throughput::Bytes( batch diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index d9c1bedb246c..e389158a1931 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -39,20 +39,18 @@ use crate::data_type::{ Int64Type, Int96Type, }; use crate::errors::Result; -use crate::schema::types::{ColumnDescriptor, ColumnPath, SchemaDescPtr, Type}; +use crate::schema::types::{ColumnDescriptor, ColumnPath, Type}; /// Create array reader from parquet schema, projection mask, and parquet file reader. pub fn build_array_reader( - parquet_schema: SchemaDescPtr, arrow_schema: SchemaRef, mask: ProjectionMask, - row_groups: Box, + row_groups: &dyn RowGroupCollection, ) -> Result> { - let field = - convert_schema(parquet_schema.as_ref(), mask, Some(arrow_schema.as_ref()))?; + let field = convert_schema(&row_groups.schema(), mask, Some(arrow_schema.as_ref()))?; match &field { - Some(field) => build_reader(field, row_groups.as_ref()), + Some(field) => build_reader(field, row_groups), None => Ok(make_empty_array_reader(row_groups.num_rows())), } } @@ -104,13 +102,11 @@ fn build_list_reader( let data_type = field.arrow_type.clone(); let item_reader = build_reader(&children[0], row_groups)?; - let item_type = item_reader.get_data_type().clone(); match is_large { false => Ok(Box::new(ListArrayReader::::new( item_reader, data_type, - item_type, field.def_level, field.rep_level, field.nullable, @@ -118,7 +114,6 @@ fn build_list_reader( true => Ok(Box::new(ListArrayReader::::new( item_reader, data_type, - item_type, field.def_level, field.rep_level, field.nullable, @@ -318,7 +313,7 @@ mod tests { use super::*; use crate::arrow::parquet_to_arrow_schema; use crate::file::reader::{FileReader, SerializedFileReader}; - use crate::util::test_common::get_test_file; + use crate::util::test_common::file_util::get_test_file; use arrow::datatypes::Field; use std::sync::Arc; @@ -336,13 +331,8 @@ mod tests { ) .unwrap(); - let array_reader = build_array_reader( - file_reader.metadata().file_metadata().schema_descr_ptr(), - Arc::new(arrow_schema), - mask, - Box::new(file_reader), - ) - .unwrap(); + let array_reader = + build_array_reader(Arc::new(arrow_schema), mask, &file_reader).unwrap(); // Create arrow types let arrow_type = DataType::Struct(vec![Field::new( diff --git a/parquet/src/arrow/array_reader/complex_object_array.rs b/parquet/src/arrow/array_reader/complex_object_array.rs index 79b53733176c..4f958fea446f 100644 --- a/parquet/src/arrow/array_reader/complex_object_array.rs +++ b/parquet/src/arrow/array_reader/complex_object_array.rs @@ -197,19 +197,24 @@ where } fn skip_records(&mut self, num_records: usize) -> Result { - match self.column_reader.as_mut() { - Some(reader) => reader.skip_records(num_records), - None => { - if self.next_column_reader()? 
{ - self.column_reader - .as_mut() - .unwrap() - .skip_records(num_records) - } else { - Ok(0) - } + let mut num_read = 0; + while (self.column_reader.is_some() || self.next_column_reader()?) + && num_read < num_records + { + let remain_to_skip = num_records - num_read; + let skip = self + .column_reader + .as_mut() + .unwrap() + .skip_records(remain_to_skip)?; + num_read += skip; + // skip < remain_to_skip means end of row group + // self.next_column_reader() == false means end of file + if skip < remain_to_skip && !self.next_column_reader()? { + break; } } + Ok(num_read) } fn get_def_levels(&self) -> Option<&[i16]> { diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index c245c61312fc..d2fa94611906 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -34,7 +34,6 @@ use std::sync::Arc; pub struct ListArrayReader { item_reader: Box, data_type: ArrowType, - item_type: ArrowType, /// The definition level at which this list is not null def_level: i16, /// The repetition level that corresponds to a new value in this array @@ -49,7 +48,6 @@ impl ListArrayReader { pub fn new( item_reader: Box, data_type: ArrowType, - item_type: ArrowType, def_level: i16, rep_level: i16, nullable: bool, @@ -57,7 +55,6 @@ impl ListArrayReader { Self { item_reader, data_type, - item_type, def_level, rep_level, nullable, @@ -268,10 +265,7 @@ mod tests { item_nullable: bool, ) -> ArrowType { let field = Box::new(Field::new("item", data_type, item_nullable)); - match OffsetSize::IS_LARGE { - true => ArrowType::LargeList(field), - false => ArrowType::List(field), - } + GenericListArray::::DATA_TYPE_CONSTRUCTOR(field) } fn downcast( @@ -307,13 +301,13 @@ mod tests { // ] let l3_item_type = ArrowType::Int32; - let l3_type = list_type::(l3_item_type.clone(), true); + let l3_type = list_type::(l3_item_type, true); let l2_item_type = l3_type.clone(); - let l2_type = list_type::(l2_item_type.clone(), true); + let l2_type = list_type::(l2_item_type, true); let l1_item_type = l2_type.clone(); - let l1_type = list_type::(l1_item_type.clone(), false); + let l1_type = list_type::(l1_item_type, false); let leaf = PrimitiveArray::::from_iter(vec![ Some(1), @@ -390,7 +384,6 @@ mod tests { let l3 = ListArrayReader::::new( Box::new(item_array_reader), l3_type, - l3_item_type, 5, 3, true, @@ -399,7 +392,6 @@ mod tests { let l2 = ListArrayReader::::new( Box::new(l3), l2_type, - l2_item_type, 3, 2, false, @@ -408,7 +400,6 @@ mod tests { let mut l1 = ListArrayReader::::new( Box::new(l2), l1_type, - l1_item_type, 2, 1, true, @@ -459,7 +450,6 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), list_type::(ArrowType::Int32, true), - ArrowType::Int32, 1, 1, false, @@ -512,7 +502,6 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), list_type::(ArrowType::Int32, true), - ArrowType::Int32, 2, 1, true, @@ -593,13 +582,9 @@ mod tests { let schema = file_metadata.schema_descr_ptr(); let mask = ProjectionMask::leaves(&schema, vec![0]); - let mut array_reader = build_array_reader( - schema, - Arc::new(arrow_schema), - mask, - Box::new(file_reader), - ) - .unwrap(); + let mut array_reader = + build_array_reader(Arc::new(arrow_schema), mask, &file_reader) + .unwrap(); let batch = array_reader.next_batch(100).unwrap(); assert_eq!(batch.data_type(), array_reader.get_data_type()); diff --git a/parquet/src/arrow/array_reader/map_array.rs 
b/parquet/src/arrow/array_reader/map_array.rs index 83ba63ca1706..3ba7f6960ec3 100644 --- a/parquet/src/arrow/array_reader/map_array.rs +++ b/parquet/src/arrow/array_reader/map_array.rs @@ -32,6 +32,7 @@ pub struct MapArrayReader { value_reader: Box, data_type: ArrowType, map_def_level: i16, + #[allow(unused)] map_rep_level: i16, } @@ -47,6 +48,7 @@ impl MapArrayReader { key_reader, value_reader, data_type, + // These are the wrong way round https://github.com/apache/arrow-rs/issues/1699 map_def_level: rep_level, map_rep_level: def_level, } diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs index d7665ef0f6b2..54c45a336a37 100644 --- a/parquet/src/arrow/array_reader/mod.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -100,7 +100,7 @@ pub trait ArrayReader: Send { /// A collection of row groups pub trait RowGroupCollection { /// Get schema of parquet file. - fn schema(&self) -> Result; + fn schema(&self) -> SchemaDescPtr; /// Get the numer of rows in this collection fn num_rows(&self) -> usize; @@ -110,8 +110,8 @@ pub trait RowGroupCollection { } impl RowGroupCollection for Arc { - fn schema(&self) -> Result { - Ok(self.metadata().file_metadata().schema_descr_ptr()) + fn schema(&self) -> SchemaDescPtr { + self.metadata().file_metadata().schema_descr_ptr() } fn num_rows(&self) -> usize { diff --git a/parquet/src/arrow/array_reader/null_array.rs b/parquet/src/arrow/array_reader/null_array.rs index 682d15f8a177..405633f0a823 100644 --- a/parquet/src/arrow/array_reader/null_array.rs +++ b/parquet/src/arrow/array_reader/null_array.rs @@ -39,7 +39,6 @@ where pages: Box, def_levels_buffer: Option, rep_levels_buffer: Option, - column_desc: ColumnDescPtr, record_reader: RecordReader, } @@ -50,14 +49,13 @@ where { /// Construct null array reader. pub fn new(pages: Box, column_desc: ColumnDescPtr) -> Result { - let record_reader = RecordReader::::new(column_desc.clone()); + let record_reader = RecordReader::::new(column_desc); Ok(Self { data_type: ArrowType::Null, pages, def_levels_buffer: None, rep_levels_buffer: None, - column_desc, record_reader, }) } diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index 45614d50941c..35f523e3d0d7 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -25,7 +25,7 @@ use crate::data_type::DataType; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use arrow::array::{ - ArrayDataBuilder, ArrayRef, BasicDecimalArray, BooleanArray, BooleanBufferBuilder, + ArrayDataBuilder, ArrayRef, BooleanArray, BooleanBufferBuilder, Decimal128Array, Float32Array, Float64Array, Int32Array, Int64Array, }; use arrow::buffer::Buffer; @@ -36,22 +36,21 @@ use std::sync::Arc; /// Primitive array readers are leaves of array reader tree. They accept page iterator /// and read them into primitive arrays. pub struct PrimitiveArrayReader -where - T: DataType, - T::T: ScalarValue, + where + T: DataType, + T::T: ScalarValue, { data_type: ArrowType, pages: Box, def_levels_buffer: Option, rep_levels_buffer: Option, - column_desc: ColumnDescPtr, record_reader: RecordReader, } impl PrimitiveArrayReader -where - T: DataType, - T::T: ScalarValue, + where + T: DataType, + T::T: ScalarValue, { /// Construct primitive array reader. 
pub fn new( @@ -67,14 +66,13 @@ where .clone(), }; - let record_reader = RecordReader::::new(column_desc.clone()); + let record_reader = RecordReader::::new(column_desc); Ok(Self { data_type, pages, def_levels_buffer: None, rep_levels_buffer: None, - column_desc, record_reader, }) } @@ -82,9 +80,9 @@ where /// Implementation of primitive array reader. impl ArrayReader for PrimitiveArrayReader -where - T: DataType, - T::T: ScalarValue, + where + T: DataType, + T::T: ScalarValue, { fn as_any(&self) -> &dyn Any { self @@ -178,6 +176,7 @@ where // are datatypes which we must convert explicitly. // These are: // - date64: we should cast int32 to date32, then date32 to date64. + // - decimal: cast in32 to decimal, int64 to decimal let array = match target_type { ArrowType::Date64 => { // this is cheap as it internally reinterprets the data @@ -205,10 +204,10 @@ where return Err(arrow_err!( "Cannot convert {:?} to decimal", array.data_type() - )) + )); } } - .with_precision_and_scale(p, s)?; + .with_precision_and_scale(p, s)?; Arc::new(array) as ArrayRef } @@ -241,17 +240,19 @@ mod tests { use crate::arrow::array_reader::test_util::EmptyPageIterator; use crate::basic::Encoding; use crate::column::page::Page; - use crate::data_type::Int32Type; + use crate::data_type::{Int32Type, Int64Type}; use crate::schema::parser::parse_message_type; use crate::schema::types::SchemaDescriptor; - use crate::util::test_common::make_pages; + use crate::util::test_common::rand_gen::make_pages; use crate::util::InMemoryPageIterator; - use arrow::array::PrimitiveArray; - use arrow::datatypes::ArrowPrimitiveType; + use arrow::array::{Array, PrimitiveArray}; + use arrow::datatypes::{ArrowPrimitiveType}; use rand::distributions::uniform::SampleUniform; use std::collections::VecDeque; + use arrow::datatypes::DataType::Decimal128; + #[allow(clippy::too_many_arguments)] fn make_column_chunks( column_desc: ColumnDescPtr, encoding: Encoding, @@ -315,7 +316,7 @@ mod tests { column_desc, None, ) - .unwrap(); + .unwrap(); // expect no values to be read let array = array_reader.next_batch(50).unwrap(); @@ -362,7 +363,7 @@ mod tests { column_desc, None, ) - .unwrap(); + .unwrap(); // Read first 50 values, which are all from the first column chunk let array = array_reader.next_batch(50).unwrap(); @@ -562,7 +563,7 @@ mod tests { column_desc, None, ) - .unwrap(); + .unwrap(); let mut accu_len: usize = 0; @@ -603,4 +604,110 @@ mod tests { ); } } + + + #[test] + fn test_primitive_array_reader_decimal_types() { + // parquet `INT32` to decimal + let message_type = " + message test_schema { + REQUIRED INT32 decimal1 (DECIMAL(8,2)); + } + "; + let schema = parse_message_type(message_type) + .map(|t| Arc::new(SchemaDescriptor::new(Arc::new(t)))) + .unwrap(); + let column_desc = schema.column(0); + + // create the array reader + { + let mut data = Vec::new(); + let mut page_lists = Vec::new(); + make_column_chunks::( + column_desc.clone(), + Encoding::PLAIN, + 100, + -99999999, + 99999999, + &mut Vec::new(), + &mut Vec::new(), + &mut data, + &mut page_lists, + true, + 2, + ); + let page_iterator = + InMemoryPageIterator::new(schema, column_desc.clone(), page_lists); + + let mut array_reader = PrimitiveArrayReader::::new( + Box::new(page_iterator), + column_desc, + None, + ) + .unwrap(); + + // read data from the reader + // the data type is decimal(8,2) + let array = array_reader.next_batch(50).unwrap(); + assert_eq!(array.data_type(), &Decimal128(8, 2)); + let array = array.as_any().downcast_ref::().unwrap(); + let data_decimal_array = 
data[0..50].iter().copied().map(|v| Some(v as i128)).collect::().with_precision_and_scale(8, 2).unwrap(); + assert_eq!(array, &data_decimal_array); + + // not equal with different data type(precision and scale) + let data_decimal_array = data[0..50].iter().copied().map(|v| Some(v as i128)).collect::().with_precision_and_scale(9, 0).unwrap(); + assert_ne!(array, &data_decimal_array) + } + + // parquet `INT64` to decimal + let message_type = " + message test_schema { + REQUIRED INT64 decimal1 (DECIMAL(18,4)); + } + "; + let schema = parse_message_type(message_type) + .map(|t| Arc::new(SchemaDescriptor::new(Arc::new(t)))) + .unwrap(); + let column_desc = schema.column(0); + + // create the array reader + { + let mut data = Vec::new(); + let mut page_lists = Vec::new(); + make_column_chunks::( + column_desc.clone(), + Encoding::PLAIN, + 100, + -999999999999999999, + 999999999999999999, + &mut Vec::new(), + &mut Vec::new(), + &mut data, + &mut page_lists, + true, + 2, + ); + let page_iterator = + InMemoryPageIterator::new(schema, column_desc.clone(), page_lists); + + let mut array_reader = PrimitiveArrayReader::::new( + Box::new(page_iterator), + column_desc, + None, + ) + .unwrap(); + + // read data from the reader + // the data type is decimal(18,4) + let array = array_reader.next_batch(50).unwrap(); + assert_eq!(array.data_type(), &Decimal128(18, 4)); + let array = array.as_any().downcast_ref::().unwrap(); + let data_decimal_array = data[0..50].iter().copied().map(|v| Some(v as i128)).collect::().with_precision_and_scale(18, 4).unwrap(); + assert_eq!(array, &data_decimal_array); + + // not equal with different data type(precision and scale) + let data_decimal_array = data[0..50].iter().copied().map(|v| Some(v as i128)).collect::().with_precision_and_scale(34, 0).unwrap(); + assert_ne!(array, &data_decimal_array) + } + } } diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index b333c66cb213..f682f146c721 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -314,7 +314,6 @@ mod tests { let list_reader = ListArrayReader::::new( Box::new(reader), expected_l.data_type().clone(), - ArrowType::Int32, 3, 1, true, diff --git a/parquet/src/arrow/array_reader/test_util.rs b/parquet/src/arrow/array_reader/test_util.rs index da9b8d3bf9b2..ca1aabfd4aa1 100644 --- a/parquet/src/arrow/array_reader/test_util.rs +++ b/parquet/src/arrow/array_reader/test_util.rs @@ -48,8 +48,7 @@ pub fn utf8_column() -> ColumnDescPtr { /// Encode `data` with the provided `encoding` pub fn encode_byte_array(encoding: Encoding, data: &[ByteArray]) -> ByteBufferPtr { - let descriptor = utf8_column(); - let mut encoder = get_encoder::(descriptor, encoding).unwrap(); + let mut encoder = get_encoder::(encoding).unwrap(); encoder.put(data).unwrap(); encoder.flush_buffer().unwrap() diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs new file mode 100644 index 000000000000..8945ccde4248 --- /dev/null +++ b/parquet/src/arrow/arrow_reader/filter.rs @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::arrow::ProjectionMask; +use arrow::array::BooleanArray; +use arrow::error::Result as ArrowResult; +use arrow::record_batch::RecordBatch; + +/// A predicate operating on [`RecordBatch`] +pub trait ArrowPredicate: Send + 'static { + /// Returns the [`ProjectionMask`] that describes the columns required + /// to evaluate this predicate. All projected columns will be provided in the `batch` + /// passed to [`evaluate`](Self::evaluate) + fn projection(&self) -> &ProjectionMask; + + /// Evaluate this predicate for the given [`RecordBatch`] containing the columns + /// identified by [`Self::projection`] + /// + /// Rows that are `true` in the returned [`BooleanArray`] will be returned by the + /// parquet reader, whereas rows that are `false` or `Null` will not be + fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult; +} + +/// An [`ArrowPredicate`] created from an [`FnMut`] +pub struct ArrowPredicateFn { + f: F, + projection: ProjectionMask, +} + +impl ArrowPredicateFn +where + F: FnMut(RecordBatch) -> ArrowResult + Send + 'static, +{ + /// Create a new [`ArrowPredicateFn`]. `f` will be passed batches + /// that contains the columns specified in `projection` + /// and returns a [`BooleanArray`] that describes which rows should + /// be passed along + pub fn new(projection: ProjectionMask, f: F) -> Self { + Self { f, projection } + } +} + +impl ArrowPredicate for ArrowPredicateFn +where + F: FnMut(RecordBatch) -> ArrowResult + Send + 'static, +{ + fn projection(&self) -> &ProjectionMask { + &self.projection + } + + fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult { + (self.f)(batch) + } +} + +/// A [`RowFilter`] allows pushing down a filter predicate to skip IO and decode +/// +/// This consists of a list of [`ArrowPredicate`] where only the rows that satisfy all +/// of the predicates will be returned. Any [`RowSelection`] will be applied prior +/// to the first predicate, and each predicate in turn will then be used to compute +/// a more refined [`RowSelection`] to use when evaluating the subsequent predicates. +/// +/// Once all predicates have been evaluated, the final [`RowSelection`] is applied +/// to the top-level [`ProjectionMask`] to produce the final output [`RecordBatch`]. +/// +/// This design has a couple of implications: +/// +/// * [`RowFilter`] can be used to skip entire pages, and thus IO, in addition to CPU decode overheads +/// * Columns may be decoded multiple times if they appear in multiple [`ProjectionMask`] +/// * IO will be deferred until needed by a [`ProjectionMask`] +/// +/// As such there is a trade-off between a single large predicate, or multiple predicates, +/// that will depend on the shape of the data. Whilst multiple smaller predicates may +/// minimise the amount of data scanned/decoded, it may not be faster overall. +/// +/// For example, if a predicate that needs a single column of data filters out all but +/// 1% of the rows, applying it as one of the early `ArrowPredicateFn` will likely significantly +/// improve performance. 
+/// +/// As a counter example, if a predicate needs several columns of data to evaluate but +/// leaves 99% of the rows, it may be better to not filter the data from parquet and +/// apply the filter after the RecordBatch has been fully decoded. +/// +/// [`RowSelection`]: [super::selection::RowSelection] +pub struct RowFilter { + /// A list of [`ArrowPredicate`] + pub(crate) predicates: Vec>, +} + +impl RowFilter { + /// Create a new [`RowFilter`] from an array of [`ArrowPredicate`] + pub fn new(predicates: Vec>) -> Self { + Self { predicates } + } +} diff --git a/parquet/src/arrow/arrow_reader.rs b/parquet/src/arrow/arrow_reader/mod.rs similarity index 80% rename from parquet/src/arrow/arrow_reader.rs rename to parquet/src/arrow/arrow_reader/mod.rs index 3cd5cb9d4ed9..e363919f6516 100644 --- a/parquet/src/arrow/arrow_reader.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -21,6 +21,7 @@ use std::collections::VecDeque; use std::sync::Arc; use arrow::array::Array; +use arrow::compute::prep_null_mask_filter; use arrow::datatypes::{DataType as ArrowType, Schema, SchemaRef}; use arrow::error::Result as ArrowResult; use arrow::record_batch::{RecordBatch, RecordBatchReader}; @@ -36,6 +37,17 @@ use crate::file::reader::{ChunkReader, FileReader, SerializedFileReader}; use crate::file::serialized_reader::ReadOptionsBuilder; use crate::schema::types::SchemaDescriptor; +#[allow(unused)] +mod filter; +#[allow(unused)] +mod selection; + +// TODO: Make these public once stable (#1792) +#[allow(unused_imports)] +pub(crate) use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter}; +#[allow(unused_imports)] +pub(crate) use selection::{RowSelection, RowSelector}; + /// Arrow reader api. /// With this api, user can get arrow schema from parquet file, and read parquet data /// into arrow arrays. 
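
The `ArrowPredicateFn` and `RowFilter` types introduced in `filter.rs` above are still `pub(crate)` (see the TODO referencing #1792), so the following is only a sketch of the intended usage once they are exposed, written as if from inside the crate. The choice of leaf column, the `Int32Array` downcast, and the `> 100` condition are illustrative assumptions, not part of the diff.

```rust
use arrow::array::{BooleanArray, Int32Array};
use arrow::record_batch::RecordBatch;

use crate::arrow::arrow_reader::{ArrowPredicateFn, RowFilter};
use crate::arrow::ProjectionMask;
use crate::schema::types::SchemaDescriptor;

/// Sketch: keep only rows whose first leaf column (assumed Int32) is > 100.
fn make_filter(schema: &SchemaDescriptor) -> RowFilter {
    // Only leaf column 0 needs to be decoded to evaluate this predicate.
    let projection = ProjectionMask::leaves(schema, [0]);

    let predicate = ArrowPredicateFn::new(projection, |batch: RecordBatch| {
        let col = batch
            .column(0)
            .as_any()
            .downcast_ref::<Int32Array>()
            .expect("illustrative assumption: leaf 0 is Int32");

        // Null inputs become `None`; per the docs above, `false` and `Null`
        // rows are both dropped by the reader.
        Ok(BooleanArray::from_iter(
            col.iter().map(|v| v.map(|v| v > 100)),
        ))
    });

    RowFilter::new(vec![Box::new(predicate)])
}
```

Multiple such predicates can be passed to `RowFilter::new`; per the trade-off described above, cheap, highly selective predicates are the best candidates to run first.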
@@ -72,44 +84,15 @@ pub trait ArrowReader { ) -> Result; } -/// [`RowSelection`] allows selecting or skipping a provided number of rows -/// when scanning the parquet file -#[derive(Debug, Clone, Copy)] -pub(crate) struct RowSelection { - /// The number of rows - pub row_count: usize, - - /// If true, skip `row_count` rows - pub skip: bool, -} - -impl RowSelection { - /// Select `row_count` rows - pub fn select(row_count: usize) -> Self { - Self { - row_count, - skip: false, - } - } - - /// Skip `row_count` rows - pub fn skip(row_count: usize) -> Self { - Self { - row_count, - skip: true, - } - } -} - #[derive(Debug, Clone, Default)] pub struct ArrowReaderOptions { skip_arrow_metadata: bool, - selection: Option>, + selection: Option, } impl ArrowReaderOptions { /// Create a new [`ArrowReaderOptions`] with the default settings - fn new() -> Self { + pub fn new() -> Self { Self::default() } @@ -128,11 +111,9 @@ impl ArrowReaderOptions { /// Scan rows from the parquet file according to the provided `selection` /// - /// TODO: Make public once row selection fully implemented (#1792) - pub(crate) fn with_row_selection( - self, - selection: impl Into>, - ) -> Self { + /// TODO: Revisit this API, as [`Self`] is provided before the file metadata is available + #[allow(unused)] + pub(crate) fn with_row_selection(self, selection: impl Into) -> Self { Self { selection: Some(selection.into()), ..self @@ -140,6 +121,9 @@ impl ArrowReaderOptions { } } +/// An `ArrowReader` that can be used to synchronously read parquet data as [`RecordBatch`] +/// +/// See [`crate::arrow::async_reader`] for an asynchronous interface pub struct ParquetFileArrowReader { file_reader: Arc, @@ -175,21 +159,13 @@ impl ArrowReader for ParquetFileArrowReader { mask: ProjectionMask, batch_size: usize, ) -> Result { - let array_reader = build_array_reader( - self.file_reader - .metadata() - .file_metadata() - .schema_descr_ptr(), - Arc::new(self.get_schema()?), - mask, - Box::new(self.file_reader.clone()), - )?; + let array_reader = + build_array_reader(Arc::new(self.get_schema()?), mask, &self.file_reader)?; - let selection = self.options.selection.clone().map(Into::into); Ok(ParquetRecordBatchReader::new( batch_size, array_reader, - selection, + self.options.selection.clone(), )) } } @@ -276,54 +252,68 @@ impl ParquetFileArrowReader { } } +/// An `Iterator>` that yields [`RecordBatch`] +/// read from a parquet data source pub struct ParquetRecordBatchReader { batch_size: usize, array_reader: Box, schema: SchemaRef, - selection: Option>, + selection: Option>, } impl Iterator for ParquetRecordBatchReader { type Item = ArrowResult; fn next(&mut self) -> Option { - let to_read = match self.selection.as_mut() { - Some(selection) => loop { - let front = selection.pop_front()?; - if front.skip { - let skipped = match self.array_reader.skip_records(front.row_count) { - Ok(skipped) => skipped, - Err(e) => return Some(Err(e.into())), - }; - - if skipped != front.row_count { - return Some(Err(general_err!( - "failed to skip rows, expected {}, got {}", - front.row_count, - skipped - ) - .into())); + let mut read_records = 0; + match self.selection.as_mut() { + Some(selection) => { + while read_records < self.batch_size && !selection.is_empty() { + let front = selection.pop_front().unwrap(); + if front.skip { + let skipped = + match self.array_reader.skip_records(front.row_count) { + Ok(skipped) => skipped, + Err(e) => return Some(Err(e.into())), + }; + + if skipped != front.row_count { + return Some(Err(general_err!( + "failed to skip 
rows, expected {}, got {}", + front.row_count, + skipped + ) + .into())); + } + continue; } - continue; - } - // try to read record - let to_read = match front.row_count.checked_sub(self.batch_size) { - Some(remaining) if remaining != 0 => { - // if page row count less than batch_size we must set batch size to page row count. - // add check avoid dead loop - selection.push_front(RowSelection::select(remaining)); - self.batch_size + // try to read record + let need_read = self.batch_size - read_records; + let to_read = match front.row_count.checked_sub(need_read) { + Some(remaining) if remaining != 0 => { + // if page row count less than batch_size we must set batch size to page row count. + // add check avoid dead loop + selection.push_front(RowSelector::select(remaining)); + need_read + } + _ => front.row_count, + }; + match self.array_reader.read_records(to_read) { + Ok(0) => break, + Ok(rec) => read_records += rec, + Err(error) => return Some(Err(error.into())), } - _ => front.row_count, - }; - - break to_read; - }, - None => self.batch_size, + } + } + None => { + if let Err(error) = self.array_reader.read_records(self.batch_size) { + return Some(Err(error.into())); + } + } }; - match self.array_reader.next_batch(to_read) { + match self.array_reader.consume_batch() { Err(error) => Some(Err(error.into())), Ok(array) => { let struct_array = @@ -349,22 +339,13 @@ impl RecordBatchReader for ParquetRecordBatchReader { } impl ParquetRecordBatchReader { - pub fn try_new( - batch_size: usize, - array_reader: Box, - ) -> Result { - Ok(Self::new(batch_size, array_reader, None)) - } - /// Create a new [`ParquetRecordBatchReader`] that will read at most `batch_size` rows at /// a time from [`ArrayReader`] based on the configured `selection`. If `selection` is `None` /// all rows will be returned - /// - /// TODO: Make public once row selection fully implemented (#1792) pub(crate) fn new( batch_size: usize, array_reader: Box, - selection: Option>, + selection: Option, ) -> Self { let schema = match array_reader.get_data_type() { ArrowType::Struct(ref fields) => Schema::new(fields.clone()), @@ -375,11 +356,41 @@ impl ParquetRecordBatchReader { batch_size, array_reader, schema: Arc::new(schema), - selection, + selection: selection.map(Into::into), } } } +/// Evaluates an [`ArrowPredicate`] returning the [`RowSelection`] +/// +/// If this [`ParquetRecordBatchReader`] has a [`RowSelection`], the +/// returned [`RowSelection`] will be the conjunction of this and +/// the rows selected by `predicate` +#[allow(unused)] +pub(crate) fn evaluate_predicate( + batch_size: usize, + array_reader: Box, + input_selection: Option, + predicate: &mut dyn ArrowPredicate, +) -> Result { + let reader = + ParquetRecordBatchReader::new(batch_size, array_reader, input_selection.clone()); + let mut filters = vec![]; + for maybe_batch in reader { + let filter = predicate.evaluate(maybe_batch?)?; + match filter.null_count() { + 0 => filters.push(filter), + _ => filters.push(prep_null_mask_filter(&filter)), + }; + } + + let raw = RowSelection::from_filters(&filters); + Ok(match input_selection { + Some(selection) => selection.and_then(&raw), + None => raw, + }) +} + #[cfg(test)] mod tests { use bytes::Bytes; @@ -391,9 +402,7 @@ mod tests { use std::path::PathBuf; use std::sync::Arc; - use rand::{thread_rng, RngCore}; - use serde_json::json; - use serde_json::Value::{Array as JArray, Null as JNull, Object as JObject}; + use rand::{thread_rng, Rng, RngCore}; use tempfile::tempfile; use arrow::array::*; @@ -404,11 +413,10 @@ mod 
tests { use crate::arrow::arrow_reader::{ ArrowReader, ArrowReaderOptions, ParquetFileArrowReader, - ParquetRecordBatchReader, RowSelection, + ParquetRecordBatchReader, RowSelection, RowSelector, }; use crate::arrow::buffer::converter::{ - BinaryArrayConverter, Converter, FixedSizeArrayConverter, FromConverter, - IntervalDayTimeArrayConverter, LargeUtf8ArrayConverter, Utf8ArrayConverter, + Converter, FixedSizeArrayConverter, IntervalDayTimeArrayConverter, }; use crate::arrow::schema::add_encoded_arrow_schema_to_metadata; use crate::arrow::{ArrowWriter, ProjectionMask}; @@ -423,54 +431,35 @@ mod tests { use crate::file::writer::SerializedFileWriter; use crate::schema::parser::parse_message_type; use crate::schema::types::{Type, TypePtr}; - use crate::util::test_common::RandGen; + use crate::util::test_common::rand_gen::RandGen; #[test] fn test_arrow_reader_all_columns() { - let json_values = get_json_array("parquet/generated_simple_numerics/blogs.json"); - let parquet_file_reader = get_test_reader("parquet/generated_simple_numerics/blogs.parquet"); - let max_len = parquet_file_reader.metadata().file_metadata().num_rows() as usize; - let mut arrow_reader = ParquetFileArrowReader::new(parquet_file_reader); - let mut record_batch_reader = arrow_reader + let record_batch_reader = arrow_reader .get_record_reader(60) .expect("Failed to read into array!"); // Verify that the schema was correctly parsed let original_schema = arrow_reader.get_schema().unwrap().fields().clone(); assert_eq!(original_schema, *record_batch_reader.schema().fields()); - - compare_batch_json(&mut record_batch_reader, json_values, max_len); } #[test] fn test_arrow_reader_single_column() { - let json_values = get_json_array("parquet/generated_simple_numerics/blogs.json"); - - let projected_json_values = json_values - .into_iter() - .map(|value| match value { - JObject(fields) => { - json!({ "blog_id": fields.get("blog_id").unwrap_or(&JNull).clone()}) - } - _ => panic!("Input should be json object array!"), - }) - .collect::>(); - let parquet_file_reader = get_test_reader("parquet/generated_simple_numerics/blogs.parquet"); let file_metadata = parquet_file_reader.metadata().file_metadata(); - let max_len = file_metadata.num_rows() as usize; let mask = ProjectionMask::leaves(file_metadata.schema_descr(), [2]); let mut arrow_reader = ParquetFileArrowReader::new(parquet_file_reader); - let mut record_batch_reader = arrow_reader + let record_batch_reader = arrow_reader .get_record_reader_by_columns(mask, 60) .expect("Failed to read into array!"); @@ -478,8 +467,6 @@ mod tests { let original_schema = arrow_reader.get_schema().unwrap().fields().clone(); assert_eq!(1, record_batch_reader.schema().fields().len()); assert_eq!(original_schema[1], record_batch_reader.schema().fields()[0]); - - compare_batch_json(&mut record_batch_reader, projected_json_values, max_len); } #[test] @@ -525,29 +512,29 @@ mod tests { #[test] fn test_primitive_single_column_reader_test() { - run_single_column_reader_tests::( + run_single_column_reader_tests::( 2, ConvertedType::NONE, None, - &FromConverter::new(), + |vals| Arc::new(BooleanArray::from_iter(vals.iter().cloned())), &[Encoding::PLAIN, Encoding::RLE, Encoding::RLE_DICTIONARY], ); - run_single_column_reader_tests::( + run_single_column_reader_tests::( 2, ConvertedType::NONE, None, - &FromConverter::new(), + |vals| Arc::new(Int32Array::from_iter(vals.iter().cloned())), &[ Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::DELTA_BINARY_PACKED, ], ); - run_single_column_reader_tests::( + 
run_single_column_reader_tests::( 2, ConvertedType::NONE, None, - &FromConverter::new(), + |vals| Arc::new(Int64Array::from_iter(vals.iter().cloned())), &[ Encoding::PLAIN, Encoding::RLE_DICTIONARY, @@ -569,16 +556,11 @@ mod tests { #[test] fn test_fixed_length_binary_column_reader() { let converter = FixedSizeArrayConverter::new(20); - run_single_column_reader_tests::< - FixedLenByteArrayType, - FixedSizeBinaryArray, - FixedSizeArrayConverter, - RandFixedLenGen, - >( + run_single_column_reader_tests::( 20, ConvertedType::NONE, None, - &converter, + |vals| Arc::new(converter.convert(vals.to_vec()).unwrap()), &[Encoding::PLAIN, Encoding::RLE_DICTIONARY], ); } @@ -586,16 +568,11 @@ mod tests { #[test] fn test_interval_day_time_column_reader() { let converter = IntervalDayTimeArrayConverter {}; - run_single_column_reader_tests::< - FixedLenByteArrayType, - IntervalDayTimeArray, - IntervalDayTimeArrayConverter, - RandFixedLenGen, - >( + run_single_column_reader_tests::( 12, ConvertedType::INTERVAL, None, - &converter, + |vals| Arc::new(converter.convert(vals.to_vec()).unwrap()), &[Encoding::PLAIN, Encoding::RLE_DICTIONARY], ); } @@ -610,6 +587,12 @@ mod tests { #[test] fn test_utf8_single_column_reader_test() { + fn string_converter(vals: &[Option]) -> ArrayRef { + Arc::new(GenericStringArray::::from_iter(vals.iter().map(|x| { + x.as_ref().map(|b| std::str::from_utf8(b.data()).unwrap()) + }))) + } + let encodings = &[ Encoding::PLAIN, Encoding::RLE_DICTIONARY, @@ -617,46 +600,39 @@ mod tests { Encoding::DELTA_BYTE_ARRAY, ]; - let converter = BinaryArrayConverter {}; - run_single_column_reader_tests::< - ByteArrayType, - BinaryArray, - BinaryArrayConverter, - RandUtf8Gen, - >(2, ConvertedType::NONE, None, &converter, encodings); - - let utf8_converter = Utf8ArrayConverter {}; - run_single_column_reader_tests::< - ByteArrayType, - StringArray, - Utf8ArrayConverter, - RandUtf8Gen, - >(2, ConvertedType::UTF8, None, &utf8_converter, encodings); - - run_single_column_reader_tests::< - ByteArrayType, - StringArray, - Utf8ArrayConverter, - RandUtf8Gen, - >( + run_single_column_reader_tests::( + 2, + ConvertedType::NONE, + None, + |vals| { + Arc::new(BinaryArray::from_iter( + vals.iter().map(|x| x.as_ref().map(|x| x.data())), + )) + }, + encodings, + ); + + run_single_column_reader_tests::( + 2, + ConvertedType::UTF8, + None, + string_converter::, + encodings, + ); + + run_single_column_reader_tests::( 2, ConvertedType::UTF8, Some(ArrowDataType::Utf8), - &utf8_converter, + string_converter::, encodings, ); - let large_utf8_converter = LargeUtf8ArrayConverter {}; - run_single_column_reader_tests::< - ByteArrayType, - LargeStringArray, - LargeUtf8ArrayConverter, - RandUtf8Gen, - >( + run_single_column_reader_tests::( 2, ConvertedType::UTF8, Some(ArrowDataType::LargeUtf8), - &large_utf8_converter, + string_converter::, encodings, ); @@ -666,21 +642,21 @@ mod tests { let mut opts = TestOptions::new(2, 20, 15).with_null_percent(50); opts.encoding = *encoding; + let data_type = ArrowDataType::Dictionary( + Box::new(key.clone()), + Box::new(ArrowDataType::Utf8), + ); + // Cannot run full test suite as keys overflow, run small test instead - single_column_reader_test::< - ByteArrayType, - StringArray, - Utf8ArrayConverter, - RandUtf8Gen, - >( + single_column_reader_test::( opts, 2, ConvertedType::UTF8, - Some(ArrowDataType::Dictionary( - Box::new(key.clone()), - Box::new(ArrowDataType::Utf8), - )), - &utf8_converter, + Some(data_type.clone()), + move |vals| { + let vals = string_converter::(vals); + 
arrow::compute::cast(&vals, &data_type).unwrap() + }, ); } } @@ -695,37 +671,37 @@ mod tests { ]; for key in &key_types { - run_single_column_reader_tests::< - ByteArrayType, - StringArray, - Utf8ArrayConverter, - RandUtf8Gen, - >( + let data_type = ArrowDataType::Dictionary( + Box::new(key.clone()), + Box::new(ArrowDataType::Utf8), + ); + + run_single_column_reader_tests::( 2, ConvertedType::UTF8, - Some(ArrowDataType::Dictionary( - Box::new(key.clone()), - Box::new(ArrowDataType::Utf8), - )), - &utf8_converter, + Some(data_type.clone()), + move |vals| { + let vals = string_converter::(vals); + arrow::compute::cast(&vals, &data_type).unwrap() + }, encodings, ); // https://github.com/apache/arrow-rs/issues/1179 - // run_single_column_reader_tests::< - // ByteArrayType, - // LargeStringArray, - // LargeUtf8ArrayConverter, - // RandUtf8Gen, - // >( + // let data_type = ArrowDataType::Dictionary( + // Box::new(key.clone()), + // Box::new(ArrowDataType::LargeUtf8), + // ); + // + // run_single_column_reader_tests::( // 2, // ConvertedType::UTF8, - // Some(ArrowDataType::Dictionary( - // Box::new(key.clone()), - // Box::new(ArrowDataType::LargeUtf8), - // )), - // &large_utf8_converter, - // encodings + // Some(data_type.clone()), + // move |vals| { + // let vals = string_converter::(vals); + // arrow::compute::cast(&vals, &data_type).unwrap() + // }, + // encodings, // ); } } @@ -912,6 +888,8 @@ mod tests { enabled_statistics: EnabledStatistics, /// Encoding encoding: Encoding, + //row selections and total selected row count + row_selections: Option<(RowSelection, usize)>, } impl Default for TestOptions { @@ -927,6 +905,7 @@ mod tests { writer_version: WriterVersion::PARQUET_1_0, enabled_statistics: EnabledStatistics::Page, encoding: Encoding::PLAIN, + row_selections: None, } } } @@ -969,6 +948,20 @@ mod tests { } } + fn with_row_selections(self) -> Self { + let mut rng = thread_rng(); + let step = rng.gen_range(self.record_batch_size..self.num_rows); + let row_selections = create_test_selection( + step, + self.num_row_groups * self.num_rows, + rng.gen::(), + ); + Self { + row_selections: Some(row_selections), + ..self + } + } + fn writer_props(&self) -> WriterProperties { let builder = WriterProperties::builder() .set_data_pagesize_limit(self.max_data_page_size) @@ -995,19 +988,18 @@ mod tests { /// /// `rand_max` represents the maximum size of value to pass to to /// value generator - fn run_single_column_reader_tests( + fn run_single_column_reader_tests( rand_max: i32, converted_type: ConvertedType, arrow_type: Option, - converter: &C, + converter: F, encodings: &[Encoding], ) where T: DataType, G: RandGen, - A: Array + 'static, - C: Converter>, A> + 'static, + F: Fn(&[Option]) -> ArrayRef, { - let all_options = vec![ + let mut all_options = vec![ // choose record_batch_batch (15) so batches cross row // group boundaries (50 rows in 2 row groups) cases. TestOptions::new(2, 100, 15), @@ -1038,6 +1030,39 @@ mod tests { .with_enabled_statistics(EnabledStatistics::None), ]; + let skip_options = vec![ + // choose record_batch_batch (15) so batches cross row + // group boundaries (50 rows in 2 row groups) cases. + TestOptions::new(2, 100, 15).with_row_selections(), + // choose record_batch_batch (5) so batches sometime fall + // on row group boundaries and (25 rows in 3 row groups + // --> row groups of 10, 10, and 5). Tests buffer + // refilling edge cases. 
+ TestOptions::new(3, 25, 5).with_row_selections(), + // Choose record_batch_size (25) so all batches fall + // exactly on row group boundary (25). Tests buffer + // refilling edge cases. + TestOptions::new(4, 100, 25).with_row_selections(), + // Set maximum page size so row groups have multiple pages + TestOptions::new(3, 256, 73) + .with_max_data_page_size(128) + .with_row_selections(), + // Set small dictionary page size to test dictionary fallback + TestOptions::new(3, 256, 57) + .with_max_dict_page_size(128) + .with_row_selections(), + // Test optional but with no nulls + TestOptions::new(2, 256, 127) + .with_null_percent(0) + .with_row_selections(), + // Test optional with nulls + TestOptions::new(2, 256, 93) + .with_null_percent(25) + .with_row_selections(), + ]; + + all_options.extend(skip_options); + all_options.into_iter().for_each(|opts| { for writer_version in [WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] { @@ -1045,15 +1070,15 @@ mod tests { let opts = TestOptions { writer_version, encoding: *encoding, - ..opts + ..opts.clone() }; - single_column_reader_test::( + single_column_reader_test::( opts, rand_max, converted_type, arrow_type.clone(), - converter, + &converter, ) } } @@ -1063,24 +1088,24 @@ mod tests { /// Create a parquet file and then read it using /// `ParquetFileArrowReader` using the parameters described in /// `opts`. - fn single_column_reader_test( + fn single_column_reader_test( opts: TestOptions, rand_max: i32, converted_type: ConvertedType, arrow_type: Option, - converter: &C, + converter: F, ) where T: DataType, G: RandGen, - A: Array + 'static, - C: Converter>, A> + 'static, + F: Fn(&[Option]) -> ArrayRef, { // Print out options to facilitate debugging failures on CI println!( - "Running single_column_reader_test ConvertedType::{}/ArrowType::{:?} with Options: {:?}", - converted_type, arrow_type, opts + "Running type {:?} single_column_reader_test ConvertedType::{}/ArrowType::{:?} with Options: {:?}", + T::get_physical_type(), converted_type, arrow_type, opts ); + //according to null_percent generate def_levels let (repetition, def_levels) = match opts.null_percent.as_ref() { Some(null_percent) => { let mut rng = thread_rng(); @@ -1099,6 +1124,7 @@ mod tests { None => (Repetition::REQUIRED, None), }; + //generate random table data let values: Vec> = (0..opts.num_row_groups) .map(|idx| { let null_count = match def_levels.as_ref() { @@ -1131,9 +1157,7 @@ mod tests { .unwrap(), ); - let arrow_field = arrow_type - .clone() - .map(|t| arrow::datatypes::Field::new("leaf", t, false)); + let arrow_field = arrow_type.map(|t| Field::new("leaf", t, false)); let mut file = tempfile::tempfile().unwrap(); @@ -1149,29 +1173,37 @@ mod tests { file.rewind().unwrap(); - let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); + let mut arrow_reader; + let expected_data: Vec>; + if let Some((selections, row_count)) = opts.row_selections.clone() { + let options = + ArrowReaderOptions::new().with_row_selection(selections.clone()); + arrow_reader = + ParquetFileArrowReader::try_new_with_options(file, options).unwrap(); + let mut without_skip_data = gen_expected_data::(&def_levels, &values); + + let mut skip_data: Vec> = vec![]; + let selections: VecDeque = selections.into(); + for select in selections { + if select.skip { + without_skip_data.drain(0..select.row_count); + } else { + skip_data.extend(without_skip_data.drain(0..select.row_count)); + } + } + expected_data = skip_data; + assert_eq!(expected_data.len(), row_count); + } else { + arrow_reader = 
ParquetFileArrowReader::try_new(file).unwrap(); + //get flatten table data + expected_data = gen_expected_data::(&def_levels, &values); + assert_eq!(expected_data.len(), opts.num_rows * opts.num_row_groups); + } + let mut record_reader = arrow_reader .get_record_reader(opts.record_batch_size) .unwrap(); - let expected_data: Vec> = match def_levels { - Some(levels) => { - let mut values_iter = values.iter().flatten(); - levels - .iter() - .flatten() - .map(|d| match d { - 1 => Some(values_iter.next().cloned().unwrap()), - 0 => None, - _ => unreachable!(), - }) - .collect() - } - None => values.iter().flatten().map(|b| Some(b.clone())).collect(), - }; - - assert_eq!(expected_data.len(), opts.num_rows * opts.num_row_groups); - let mut total_read = 0; loop { let maybe_batch = record_reader.next(); @@ -1180,19 +1212,9 @@ mod tests { let batch = maybe_batch.unwrap().unwrap(); assert_eq!(end - total_read, batch.num_rows()); - let mut data = vec![]; - data.extend_from_slice(&expected_data[total_read..end]); - - let a = converter.convert(data).unwrap(); - let mut b = Arc::clone(batch.column(0)); + let a = converter(&expected_data[total_read..end]); + let b = Arc::clone(batch.column(0)); - if let Some(arrow_type) = arrow_type.as_ref() { - assert_eq!(b.data_type(), arrow_type); - if let ArrowDataType::Dictionary(_, v) = arrow_type { - assert_eq!(a.data_type(), v.as_ref()); - b = arrow::compute::cast(&b, v.as_ref()).unwrap() - } - } assert_eq!(a.data_type(), b.data_type()); assert_eq!(a.data(), b.data(), "{:#?} vs {:#?}", a.data(), b.data()); @@ -1204,17 +1226,39 @@ mod tests { } } + fn gen_expected_data( + def_levels: &Option>>, + values: &[Vec], + ) -> Vec> { + let data: Vec> = match def_levels { + Some(levels) => { + let mut values_iter = values.iter().flatten(); + levels + .iter() + .flatten() + .map(|d| match d { + 1 => Some(values_iter.next().cloned().unwrap()), + 0 => None, + _ => unreachable!(), + }) + .collect() + } + None => values.iter().flatten().map(|b| Some(b.clone())).collect(), + }; + data + } + fn generate_single_column_file_with_data( values: &[Vec], def_levels: Option<&Vec>>, file: File, schema: TypePtr, - field: Option, + field: Option, opts: &TestOptions, ) -> Result { let mut writer_props = opts.writer_props(); if let Some(field) = field { - let arrow_schema = arrow::datatypes::Schema::new(vec![field]); + let arrow_schema = Schema::new(vec![field]); add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut writer_props); } @@ -1257,39 +1301,6 @@ mod tests { File::open(path.as_path()).expect("File not found!") } - fn get_json_array(filename: &str) -> Vec { - match serde_json::from_reader(get_test_file(filename)) - .expect("Failed to read json value from file!") - { - JArray(values) => values, - _ => panic!("Input should be json array!"), - } - } - - fn compare_batch_json( - record_batch_reader: &mut dyn RecordBatchReader, - json_values: Vec, - max_len: usize, - ) { - for i in 0..20 { - let array: Option = record_batch_reader - .next() - .map(|r| r.expect("Failed to read record batch!").into()); - - let (start, end) = (i * 60_usize, (i + 1) * 60_usize); - - if start < max_len { - assert!(array.is_some()); - assert_ne!(0, array.as_ref().unwrap().len()); - let end = min(end, max_len); - let json = JArray(Vec::from(&json_values[start..end])); - assert_eq!(array.unwrap(), json) - } else { - assert!(array.is_none()); - } - } - } - #[test] fn test_read_structs() { // This particular test file has columns of struct types where there is @@ -1749,12 +1760,12 @@ mod tests { /// a `batch_size` 
and `selection` fn get_expected_batches( column: &RecordBatch, - selection: &[RowSelection], + selection: &RowSelection, batch_size: usize, ) -> Vec { let mut expected_batches = vec![]; - let mut selection: VecDeque<_> = selection.iter().cloned().collect(); + let mut selection: VecDeque<_> = selection.clone().into(); let mut row_offset = 0; let mut last_start = None; while row_offset < column.num_rows() && !selection.is_empty() { @@ -1802,6 +1813,34 @@ mod tests { expected_batches } + fn create_test_selection( + step_len: usize, + total_len: usize, + skip_first: bool, + ) -> (RowSelection, usize) { + let mut remaining = total_len; + let mut skip = skip_first; + let mut vec = vec![]; + let mut selected_count = 0; + while remaining != 0 { + let step = if remaining > step_len { + step_len + } else { + remaining + }; + vec.push(RowSelector { + row_count: step, + skip, + }); + remaining -= step; + if !skip { + selected_count += step; + } + skip = !skip; + } + (vec.into(), selected_count) + } + #[test] fn test_scan_row_with_selection() { let testdata = arrow::util::test_util::parquet_test_data(); @@ -1816,7 +1855,7 @@ mod tests { let do_test = |batch_size: usize, selection_len: usize| { for skip_first in [false, true] { let selections = - create_test_selection(batch_size, data.num_rows(), skip_first); + create_test_selection(batch_size, data.num_rows(), skip_first).0; let expected = get_expected_batches(&data, &selections, batch_size); let skip_reader = create_skip_reader(&test_file, batch_size, selections); @@ -1848,7 +1887,7 @@ mod tests { fn create_skip_reader( test_file: &File, batch_size: usize, - selections: Vec, + selections: RowSelection, ) -> ParquetRecordBatchReader { let arrow_reader_options = ArrowReaderOptions::new().with_row_selection(selections); @@ -1860,29 +1899,5 @@ mod tests { .unwrap(); skip_arrow_reader.get_record_reader(batch_size).unwrap() } - - fn create_test_selection( - step_len: usize, - total_len: usize, - skip_first: bool, - ) -> Vec { - let mut remaining = total_len; - let mut skip = skip_first; - let mut vec = vec![]; - while remaining != 0 { - let step = if remaining > step_len { - step_len - } else { - remaining - }; - vec.push(RowSelection { - row_count: step, - skip, - }); - remaining -= step; - skip = !skip; - } - vec - } } } diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs new file mode 100644 index 000000000000..8e129f5667ec --- /dev/null +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -0,0 +1,426 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
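
Before the `RowSelection` implementation that follows, here is a small sketch of how a selection is assembled from `RowSelector`s and handed to the synchronous reader via `ArrowReaderOptions::with_row_selection`. That method is still `pub(crate)` (see the TODO above), so this is written as if from inside the crate; the file handle, row counts, and batch size are placeholders.

```rust
use std::fs::File;

use crate::arrow::arrow_reader::{
    ArrowReader, ArrowReaderOptions, ParquetFileArrowReader, RowSelection, RowSelector,
};
use crate::errors::Result;

/// Sketch: skip the first 100 rows, read the next 50, skip the rest.
fn read_selected(file: File) -> Result<()> {
    // `Vec<RowSelector>` converts into a `RowSelection` via `From`.
    let selection: RowSelection =
        vec![RowSelector::skip(100), RowSelector::select(50)].into();

    let options = ArrowReaderOptions::new().with_row_selection(selection);
    let mut reader = ParquetFileArrowReader::try_new_with_options(file, options)?;

    // Batches now cover only the selected rows (at most 50 in total here).
    for batch in reader.get_record_reader(1024)? {
        let batch = batch.expect("decode error");
        println!("read {} rows", batch.num_rows());
    }
    Ok(())
}
```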
+ +use arrow::array::{Array, BooleanArray}; +use arrow::compute::SlicesIterator; +use std::cmp::Ordering; +use std::collections::VecDeque; +use std::ops::Range; + +/// [`RowSelector`] represents a range of rows to scan from a parquet file +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RowSelector { + /// The number of rows + pub row_count: usize, + + /// If true, skip `row_count` rows + pub skip: bool, +} + +impl RowSelector { + /// Select `row_count` rows + pub fn select(row_count: usize) -> Self { + Self { + row_count, + skip: false, + } + } + + /// Skip `row_count` rows + pub fn skip(row_count: usize) -> Self { + Self { + row_count, + skip: true, + } + } +} + +/// [`RowSelection`] allows selecting or skipping a provided number of rows +/// when scanning the parquet file. +/// +/// This is applied prior to reading column data, and can therefore +/// be used to skip IO to fetch data into memory +/// +/// A typical use-case would be using the [`PageIndex`] to filter out rows +/// that don't satisfy a predicate +/// +/// [`PageIndex`]: [crate::file::page_index::index::PageIndex] +#[derive(Debug, Clone, Default, Eq, PartialEq)] +pub struct RowSelection { + selectors: Vec, +} + +impl RowSelection { + /// Creates a [`RowSelection`] from a slice of [`BooleanArray`] + /// + /// # Panic + /// + /// Panics if any of the [`BooleanArray`] contain nulls + pub fn from_filters(filters: &[BooleanArray]) -> Self { + let mut next_offset = 0; + let total_rows = filters.iter().map(|x| x.len()).sum(); + + let iter = filters.iter().flat_map(|filter| { + let offset = next_offset; + next_offset += filter.len(); + assert_eq!(filter.null_count(), 0); + SlicesIterator::new(filter) + .map(move |(start, end)| start + offset..end + offset) + }); + + Self::from_consecutive_ranges(iter, total_rows) + } + + /// Creates a [`RowSelection`] from an iterator of consecutive ranges to keep + fn from_consecutive_ranges>>( + ranges: I, + total_rows: usize, + ) -> Self { + let mut selectors: Vec = Vec::with_capacity(ranges.size_hint().0); + let mut last_end = 0; + for range in ranges { + let len = range.end - range.start; + + match range.start.cmp(&last_end) { + Ordering::Equal => match selectors.last_mut() { + Some(last) => last.row_count += len, + None => selectors.push(RowSelector::select(len)), + }, + Ordering::Greater => { + selectors.push(RowSelector::skip(range.start - last_end)); + selectors.push(RowSelector::select(len)) + } + Ordering::Less => panic!("out of order"), + } + last_end = range.end; + } + + if last_end != total_rows { + selectors.push(RowSelector::skip(total_rows - last_end)) + } + + Self { selectors } + } + + /// Splits off the first `row_count` from this [`RowSelection`] + pub fn split_off(&mut self, row_count: usize) -> Self { + let mut total_count = 0; + + // Find the index where the selector exceeds the row count + let find = self.selectors.iter().enumerate().find(|(_, selector)| { + total_count += selector.row_count; + total_count > row_count + }); + + let split_idx = match find { + Some((idx, _)) => idx, + None => { + let selectors = std::mem::take(&mut self.selectors); + return Self { selectors }; + } + }; + + let mut remaining = self.selectors.split_off(split_idx); + + // Always present as `split_idx < self.selectors.len` + let next = remaining.first_mut().unwrap(); + let overflow = total_count - row_count; + + if next.row_count != overflow { + self.selectors.push(RowSelector { + row_count: next.row_count - overflow, + skip: next.skip, + }) + } + next.row_count = overflow; + + 
std::mem::swap(&mut remaining, &mut self.selectors); + Self { + selectors: remaining, + } + } + + /// Given a [`RowSelection`] computed under `self`, returns the [`RowSelection`] + /// representing their conjunction + /// + /// For example: + /// + /// self: NNNNNNNNNNNNYYYYYYYYYYYYYYYYYYYYYYNNNYYYYY + /// other: YYYYYNNNNYYYYYYYYYYYYY YYNNN + /// + /// returned: NNNNNNNNNNNNYYYYYNNNNYYYYYYYYYYYYYYNNYNNNN + /// + /// + pub fn and_then(&self, other: &Self) -> Self { + let mut selectors = vec![]; + let mut first = self.selectors.iter().cloned().peekable(); + let mut second = other.selectors.iter().cloned().peekable(); + + let mut to_skip = 0; + while let Some(b) = second.peek_mut() { + let a = first.peek_mut().unwrap(); + + if b.row_count == 0 { + second.next().unwrap(); + continue; + } + + if a.row_count == 0 { + first.next().unwrap(); + continue; + } + + if a.skip { + // Records were skipped when producing second + to_skip += a.row_count; + first.next().unwrap(); + continue; + } + + let skip = b.skip; + let to_process = a.row_count.min(b.row_count); + + a.row_count -= to_process; + b.row_count -= to_process; + + match skip { + true => to_skip += to_process, + false => { + if to_skip != 0 { + selectors.push(RowSelector::skip(to_skip)); + to_skip = 0; + } + selectors.push(RowSelector::select(to_process)) + } + } + } + + for v in first { + if v.row_count != 0 { + assert!(v.skip); + to_skip += v.row_count + } + } + + if to_skip != 0 { + selectors.push(RowSelector::skip(to_skip)); + } + + Self { selectors } + } + + /// Returns `true` if this [`RowSelection`] selects any rows + pub fn selects_any(&self) -> bool { + self.selectors.iter().any(|x| !x.skip) + } +} + +impl From> for RowSelection { + fn from(selectors: Vec) -> Self { + Self { selectors } + } +} + +impl From for VecDeque { + fn from(r: RowSelection) -> Self { + r.selectors.into() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::{thread_rng, Rng}; + + #[test] + fn test_from_filters() { + let filters = vec![ + BooleanArray::from(vec![false, false, false, true, true, true, true]), + BooleanArray::from(vec![true, true, false, false, true, true, true]), + BooleanArray::from(vec![false, false, false, false]), + BooleanArray::from(Vec::::new()), + ]; + + let selection = RowSelection::from_filters(&filters[..1]); + assert!(selection.selects_any()); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(3), RowSelector::select(4)] + ); + + let selection = RowSelection::from_filters(&filters[..2]); + assert!(selection.selects_any()); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(3), + RowSelector::select(6), + RowSelector::skip(2), + RowSelector::select(3) + ] + ); + + let selection = RowSelection::from_filters(&filters); + assert!(selection.selects_any()); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(3), + RowSelector::select(6), + RowSelector::skip(2), + RowSelector::select(3), + RowSelector::skip(4) + ] + ); + + let selection = RowSelection::from_filters(&filters[2..3]); + assert!(!selection.selects_any()); + assert_eq!(selection.selectors, vec![RowSelector::skip(4)]); + } + + #[test] + fn test_split_off() { + let mut selection = RowSelection::from(vec![ + RowSelector::skip(34), + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35), + ]); + + let split = selection.split_off(34); + assert_eq!(split.selectors, vec![RowSelector::skip(34)]); + assert_eq!( + selection.selectors, + vec![ + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35) + 
] + ); + + let split = selection.split_off(5); + assert_eq!(split.selectors, vec![RowSelector::select(5)]); + assert_eq!( + selection.selectors, + vec![ + RowSelector::select(7), + RowSelector::skip(3), + RowSelector::select(35) + ] + ); + + let split = selection.split_off(8); + assert_eq!( + split.selectors, + vec![RowSelector::select(7), RowSelector::skip(1)] + ); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(2), RowSelector::select(35)] + ); + + let split = selection.split_off(200); + assert_eq!( + split.selectors, + vec![RowSelector::skip(2), RowSelector::select(35)] + ); + assert!(selection.selectors.is_empty()); + } + + #[test] + fn test_and() { + let mut a = RowSelection::from(vec![ + RowSelector::skip(12), + RowSelector::select(23), + RowSelector::skip(3), + RowSelector::select(5), + ]); + + let b = RowSelection::from(vec![ + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(15), + RowSelector::skip(4), + ]); + + let mut expected = RowSelection::from(vec![ + RowSelector::skip(12), + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(14), + RowSelector::skip(3), + RowSelector::select(1), + RowSelector::skip(4), + ]); + + assert_eq!(a.and_then(&b), expected); + + a.split_off(7); + expected.split_off(7); + assert_eq!(a.and_then(&b), expected); + + let a = RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(3)]); + + let b = RowSelection::from(vec![ + RowSelector::select(2), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(1), + ]); + + assert_eq!( + a.and_then(&b).selectors, + vec![ + RowSelector::select(2), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(4) + ] + ); + } + + #[test] + fn test_and_fuzz() { + let mut rand = thread_rng(); + for _ in 0..100 { + let a_len = rand.gen_range(10..100); + let a_bools: Vec<_> = (0..a_len).map(|x| rand.gen_bool(0.2)).collect(); + let a = RowSelection::from_filters(&[BooleanArray::from(a_bools.clone())]); + + let b_len: usize = a_bools.iter().map(|x| *x as usize).sum(); + let b_bools: Vec<_> = (0..b_len).map(|x| rand.gen_bool(0.8)).collect(); + let b = RowSelection::from_filters(&[BooleanArray::from(b_bools.clone())]); + + let mut expected_bools = vec![false; a_len]; + + let mut iter_b = b_bools.iter(); + for (idx, b) in a_bools.iter().enumerate() { + if *b && *iter_b.next().unwrap() { + expected_bools[idx] = true; + } + } + + let expected = + RowSelection::from_filters(&[BooleanArray::from(expected_bools)]); + + let total_rows: usize = expected.selectors.iter().map(|s| s.row_count).sum(); + assert_eq!(a_len, total_rows); + + assert_eq!(a.and_then(&b), expected); + } + } +} diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index d1a0da5b391d..a7b6ccc3fc85 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -16,7 +16,6 @@ // under the License. 
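Editorial note: and_then, defined above, composes a second selection that was expressed over the rows kept by self and returns their conjunction expressed over the original rows. A minimal sketch, not part of the patch, assuming RowSelector and RowSelection from selection.rs are in scope: a keeps rows 2..5 of 6 total, b keeps only the middle of those three, so the conjunction keeps original row 3.

fn and_then_sketch() {
    let a: RowSelection =
        vec![RowSelector::skip(2), RowSelector::select(3), RowSelector::skip(1)].into();
    // b is expressed over the 3 rows selected by a
    let b: RowSelection =
        vec![RowSelector::skip(1), RowSelector::select(1), RowSelector::skip(1)].into();
    // the result is expressed over the original 6 rows
    let expected: RowSelection =
        vec![RowSelector::skip(3), RowSelector::select(1), RowSelector::skip(2)].into();
    assert_eq!(a.and_then(&b), expected);
}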
use crate::arrow::arrow_writer::levels::LevelInfo; -use crate::arrow::arrow_writer::ArrayWriter; use crate::basic::Encoding; use crate::column::page::PageWriter; use crate::column::writer::encoder::{ @@ -33,11 +32,38 @@ use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::num_required_bits; use crate::util::interner::{Interner, Storage}; use arrow::array::{ - Array, ArrayAccessor, ArrayRef, BinaryArray, LargeBinaryArray, LargeStringArray, - StringArray, + Array, ArrayAccessor, ArrayRef, BinaryArray, DictionaryArray, LargeBinaryArray, + LargeStringArray, StringArray, }; use arrow::datatypes::DataType; +macro_rules! downcast_dict_impl { + ($array:ident, $key:ident, $val:ident, $op:expr $(, $arg:expr)*) => {{ + $op($array + .as_any() + .downcast_ref::>() + .unwrap() + .downcast_dict::<$val>() + .unwrap()$(, $arg)*) + }}; +} + +macro_rules! downcast_dict_op { + ($key_type:expr, $val:ident, $array:ident, $op:expr $(, $arg:expr)*) => { + match $key_type.as_ref() { + DataType::UInt8 => downcast_dict_impl!($array, UInt8Type, $val, $op$(, $arg)*), + DataType::UInt16 => downcast_dict_impl!($array, UInt16Type, $val, $op$(, $arg)*), + DataType::UInt32 => downcast_dict_impl!($array, UInt32Type, $val, $op$(, $arg)*), + DataType::UInt64 => downcast_dict_impl!($array, UInt64Type, $val, $op$(, $arg)*), + DataType::Int8 => downcast_dict_impl!($array, Int8Type, $val, $op$(, $arg)*), + DataType::Int16 => downcast_dict_impl!($array, Int16Type, $val, $op$(, $arg)*), + DataType::Int32 => downcast_dict_impl!($array, Int32Type, $val, $op$(, $arg)*), + DataType::Int64 => downcast_dict_impl!($array, Int64Type, $val, $op$(, $arg)*), + _ => unreachable!(), + } + }; +} + macro_rules! downcast_op { ($data_type:expr, $array:ident, $op:expr $(, $arg:expr)*) => { match $data_type { @@ -51,36 +77,44 @@ macro_rules! 
downcast_op { DataType::LargeBinary => { $op($array.as_any().downcast_ref::().unwrap()$(, $arg)*) } - d => unreachable!("cannot downcast {} to byte array", d) + DataType::Dictionary(key, value) => match value.as_ref() { + DataType::Utf8 => downcast_dict_op!(key, StringArray, $array, $op$(, $arg)*), + DataType::LargeUtf8 => { + downcast_dict_op!(key, LargeStringArray, $array, $op$(, $arg)*) + } + DataType::Binary => downcast_dict_op!(key, BinaryArray, $array, $op$(, $arg)*), + DataType::LargeBinary => { + downcast_dict_op!(key, LargeBinaryArray, $array, $op$(, $arg)*) + } + d => unreachable!("cannot downcast {} dictionary value to byte array", d), + }, + d => unreachable!("cannot downcast {} to byte array", d), } }; } -/// Returns an [`ArrayWriter`] for byte or string arrays -pub(super) fn make_byte_array_writer<'a>( - descr: ColumnDescPtr, - data_type: DataType, - props: WriterPropertiesPtr, - page_writer: Box, - on_close: OnCloseColumnChunk<'a>, -) -> Box { - Box::new(ByteArrayWriter { - writer: Some(GenericColumnWriter::new(descr, props, page_writer)), - on_close: Some(on_close), - data_type, - }) -} - -/// An [`ArrayWriter`] for [`ByteArray`] -struct ByteArrayWriter<'a> { - writer: Option>, +/// A writer for byte array types +pub(super) struct ByteArrayWriter<'a> { + writer: GenericColumnWriter<'a, ByteArrayEncoder>, on_close: Option>, - data_type: DataType, } -impl<'a> ArrayWriter for ByteArrayWriter<'a> { - fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()> { - self.writer.as_mut().unwrap().write_batch_internal( +impl<'a> ByteArrayWriter<'a> { + /// Returns a new [`ByteArrayWriter`] + pub fn new( + descr: ColumnDescPtr, + props: &'a WriterPropertiesPtr, + page_writer: Box, + on_close: OnCloseColumnChunk<'a>, + ) -> Result { + Ok(Self { + writer: GenericColumnWriter::new(descr, props.clone(), page_writer), + on_close: Some(on_close), + }) + } + + pub fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()> { + self.writer.write_batch_internal( array, Some(levels.non_null_indices()), levels.def_levels(), @@ -92,11 +126,11 @@ impl<'a> ArrayWriter for ByteArrayWriter<'a> { Ok(()) } - fn close(&mut self) -> Result<()> { + pub fn close(self) -> Result<()> { let (bytes_written, rows_written, metadata, column_index, offset_index) = - self.writer.take().unwrap().close()?; + self.writer.close()?; - if let Some(on_close) = self.on_close.take() { + if let Some(on_close) = self.on_close { on_close( bytes_written, rows_written, diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 49531d9724aa..08f37c395658 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -23,7 +23,6 @@ use std::sync::Arc; use arrow::array as arrow_array; use arrow::array::ArrayRef; -use arrow::array::BasicDecimalArray; use arrow::datatypes::{DataType as ArrowDataType, IntervalUnit, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow_array::Array; @@ -33,70 +32,18 @@ use super::schema::{ decimal_length_from_precision, }; -use crate::column::writer::{get_column_writer, ColumnWriter, ColumnWriterImpl}; +use crate::arrow::arrow_writer::byte_array::ByteArrayWriter; +use crate::column::writer::{ColumnWriter, ColumnWriterImpl}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::RowGroupMetaDataPtr; use crate::file::properties::WriterProperties; -use crate::file::writer::{SerializedColumnWriter, SerializedRowGroupWriter}; +use crate::file::writer::SerializedRowGroupWriter; use 
crate::{data_type::*, file::writer::SerializedFileWriter}; use levels::{calculate_array_levels, LevelInfo}; mod byte_array; mod levels; -/// An object-safe API for writing an [`ArrayRef`] -trait ArrayWriter { - fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()>; - - fn close(&mut self) -> Result<()>; -} - -/// Fallback implementation for writing an [`ArrayRef`] that uses [`SerializedColumnWriter`] -struct ColumnArrayWriter<'a>(Option>); - -impl<'a> ArrayWriter for ColumnArrayWriter<'a> { - fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()> { - write_leaf(self.0.as_mut().unwrap().untyped(), array, levels)?; - Ok(()) - } - - fn close(&mut self) -> Result<()> { - self.0.take().unwrap().close() - } -} - -fn get_writer<'a, W: Write>( - row_group_writer: &'a mut SerializedRowGroupWriter<'_, W>, - data_type: &ArrowDataType, -) -> Result> { - let array_writer = row_group_writer - .next_column_with_factory( - |descr, props, page_writer, on_close| match data_type { - ArrowDataType::Utf8 - | ArrowDataType::LargeUtf8 - | ArrowDataType::Binary - | ArrowDataType::LargeBinary => Ok(byte_array::make_byte_array_writer( - descr, - data_type.clone(), - props.clone(), - page_writer, - on_close, - )), - _ => { - let column_writer = - get_column_writer(descr, props.clone(), page_writer); - - let serialized_writer = - SerializedColumnWriter::new(column_writer, Some(on_close)); - - Ok(Box::new(ColumnArrayWriter(Some(serialized_writer)))) - } - }, - )? - .expect("Unable to get column writer"); - Ok(array_writer) -} - /// Arrow writer /// /// Writes Arrow `RecordBatch`es to a Parquet writer, buffering up `RecordBatch` in order @@ -314,22 +261,24 @@ fn write_leaves( | ArrowDataType::Time64(_) | ArrowDataType::Duration(_) | ArrowDataType::Interval(_) - | ArrowDataType::LargeBinary - | ArrowDataType::Binary - | ArrowDataType::Utf8 - | ArrowDataType::LargeUtf8 | ArrowDataType::Decimal128(_, _) | ArrowDataType::Decimal256(_, _) | ArrowDataType::FixedSizeBinary(_) => { - let mut writer = get_writer(row_group_writer, &data_type)?; + let mut col_writer = row_group_writer.next_column()?.unwrap(); for (array, levels) in arrays.iter().zip(levels.iter_mut()) { - writer.write( - array, - levels.pop().expect("Levels exhausted"), - )?; + write_leaf(col_writer.untyped(), array, levels.pop().expect("Levels exhausted"))?; } - writer.close()?; - Ok(()) + col_writer.close() + } + ArrowDataType::LargeBinary + | ArrowDataType::Binary + | ArrowDataType::Utf8 + | ArrowDataType::LargeUtf8 => { + let mut col_writer = row_group_writer.next_column_with_factory(ByteArrayWriter::new)?.unwrap(); + for (array, levels) in arrays.iter().zip(levels.iter_mut()) { + col_writer.write(array, levels.pop().expect("Levels exhausted"))?; + } + col_writer.close() } ArrowDataType::List(_) | ArrowDataType::LargeList(_) => { let arrays: Vec<_> = arrays.iter().map(|array|{ @@ -380,18 +329,21 @@ fn write_leaves( write_leaves(row_group_writer, &values, levels)?; Ok(()) } - ArrowDataType::Dictionary(_, value_type) => { - let mut writer = get_writer(row_group_writer, value_type)?; - for (array, levels) in arrays.iter().zip(levels.iter_mut()) { - // cast dictionary to a primitive - let array = arrow::compute::cast(array, value_type)?; - writer.write( - &array, - levels.pop().expect("Levels exhausted"), - )?; + ArrowDataType::Dictionary(_, value_type) => match value_type.as_ref() { + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Binary | ArrowDataType::LargeBinary => { + let mut col_writer = 
row_group_writer.next_column_with_factory(ByteArrayWriter::new)?.unwrap(); + for (array, levels) in arrays.iter().zip(levels.iter_mut()) { + col_writer.write(array, levels.pop().expect("Levels exhausted"))?; + } + col_writer.close() + } + _ => { + let mut col_writer = row_group_writer.next_column()?.unwrap(); + for (array, levels) in arrays.iter().zip(levels.iter_mut()) { + write_leaf(col_writer.untyped(), array, levels.pop().expect("Levels exhausted"))?; + } + col_writer.close() } - writer.close()?; - Ok(()) } ArrowDataType::Float16 => Err(ParquetError::ArrowError( "Float16 arrays not supported".to_string(), diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index 640d1b81f827..5c186d7aa769 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -86,6 +86,7 @@ use std::task::{Context, Poll}; use bytes::Bytes; use futures::future::{BoxFuture, FutureExt}; +use futures::ready; use futures::stream::Stream; use parquet_format::{PageHeader, PageType}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; @@ -94,7 +95,9 @@ use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use crate::arrow::array_reader::{build_array_reader, RowGroupCollection}; -use crate::arrow::arrow_reader::ParquetRecordBatchReader; +use crate::arrow::arrow_reader::{ + evaluate_predicate, ParquetRecordBatchReader, RowFilter, RowSelection, +}; use crate::arrow::schema::parquet_to_arrow_schema; use crate::arrow::ProjectionMask; use crate::basic::Compression; @@ -102,13 +105,13 @@ use crate::column::page::{Page, PageIterator, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; -use crate::file::metadata::ParquetMetaData; +use crate::file::metadata::{ParquetMetaData, RowGroupMetaData}; use crate::file::serialized_reader::{decode_page, read_page_header}; use crate::file::FOOTER_SIZE; use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, SchemaDescriptor}; /// The asynchronous interface used by [`ParquetRecordBatchStream`] to read parquet files -pub trait AsyncFileReader { +pub trait AsyncFileReader: Send { /// Retrieve the bytes in `range` fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result>; @@ -116,10 +119,7 @@ pub trait AsyncFileReader { fn get_byte_ranges( &mut self, ranges: Vec>, - ) -> BoxFuture<'_, Result>> - where - Self: Send, - { + ) -> BoxFuture<'_, Result>> { async move { let mut result = Vec::with_capacity(ranges.len()); @@ -139,6 +139,23 @@ pub trait AsyncFileReader { fn get_metadata(&mut self) -> BoxFuture<'_, Result>>; } +impl AsyncFileReader for Box { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result> { + self.as_mut().get_bytes(range) + } + + fn get_byte_ranges( + &mut self, + ranges: Vec>, + ) -> BoxFuture<'_, Result>> { + self.as_mut().get_byte_ranges(ranges) + } + + fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { + self.as_mut().get_metadata() + } +} + impl AsyncFileReader for T { fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result> { async move { @@ -195,9 +212,13 @@ pub struct ParquetRecordBatchStreamBuilder { row_groups: Option>, projection: ProjectionMask, + + filter: Option, + + selection: Option, } -impl ParquetRecordBatchStreamBuilder { +impl ParquetRecordBatchStreamBuilder { /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided parquet file pub async fn new(mut input: T) -> Result { let metadata = 
input.get_metadata().await?; @@ -214,6 +235,8 @@ impl ParquetRecordBatchStreamBuilder { batch_size: 1024, row_groups: None, projection: ProjectionMask::all(), + filter: None, + selection: None, }) } @@ -253,6 +276,32 @@ impl ParquetRecordBatchStreamBuilder { } } + /// Provide a [`RowSelection] to filter out rows, and avoid fetching their + /// data into memory + /// + /// Row group filtering is applied prior to this, and rows from skipped + /// row groups should not be included in the [`RowSelection`] + /// + /// TODO: Make public once stable (#1792) + #[allow(unused)] + pub(crate) fn with_row_selection(self, selection: RowSelection) -> Self { + Self { + selection: Some(selection), + ..self + } + } + + /// Provide a [`RowFilter`] to skip decoding rows + /// + /// TODO: Make public once stable (#1792) + #[allow(unused)] + pub(crate) fn with_row_filter(self, filter: RowFilter) -> Self { + Self { + filter: Some(filter), + ..self + } + } + /// Build a new [`ParquetRecordBatchStream`] pub fn build(self) -> Result> { let num_row_groups = self.metadata.row_groups().len(); @@ -271,25 +320,122 @@ impl ParquetRecordBatchStreamBuilder { None => (0..self.metadata.row_groups().len()).collect(), }; + let reader = ReaderFactory { + input: self.input, + filter: self.filter, + metadata: self.metadata.clone(), + schema: self.schema.clone(), + }; + Ok(ParquetRecordBatchStream { + metadata: self.metadata, + batch_size: self.batch_size, row_groups, projection: self.projection, - batch_size: self.batch_size, - metadata: self.metadata, + selection: self.selection, schema: self.schema, - input: Some(self.input), + reader: Some(reader), state: StreamState::Init, }) } } +type ReadResult = Result<(ReaderFactory, Option)>; + +/// [`ReaderFactory`] is used by [`ParquetRecordBatchStream`] to create +/// [`ParquetRecordBatchReader`] +struct ReaderFactory { + metadata: Arc, + + schema: SchemaRef, + + input: T, + + filter: Option, +} + +impl ReaderFactory +where + T: AsyncFileReader + Send, +{ + /// Reads the next row group with the provided `selection`, `projection` and `batch_size` + /// + /// Note: this captures self so that the resulting future has a static lifetime + async fn read_row_group( + mut self, + row_group_idx: usize, + mut selection: Option, + projection: ProjectionMask, + batch_size: usize, + ) -> ReadResult { + // TODO: calling build_array multiple times is wasteful + let selects_any = |selection: Option<&RowSelection>| { + selection.map(|x| x.selects_any()).unwrap_or(true) + }; + + let meta = self.metadata.row_group(row_group_idx); + let mut row_group = InMemoryRowGroup { + schema: meta.schema_descr_ptr(), + row_count: meta.num_rows() as usize, + column_chunks: vec![None; meta.columns().len()], + }; + + if let Some(filter) = self.filter.as_mut() { + for predicate in filter.predicates.iter_mut() { + if !selects_any(selection.as_ref()) { + return Ok((self, None)); + } + + let predicate_projection = predicate.projection().clone(); + row_group + .fetch( + &mut self.input, + meta, + &predicate_projection, + selection.as_ref(), + ) + .await?; + + let array_reader = build_array_reader( + self.schema.clone(), + predicate_projection, + &row_group, + )?; + + selection = Some(evaluate_predicate( + batch_size, + array_reader, + selection, + predicate.as_mut(), + )?); + } + } + + if !selects_any(selection.as_ref()) { + return Ok((self, None)); + } + + row_group + .fetch(&mut self.input, meta, &projection, selection.as_ref()) + .await?; + + let reader = ParquetRecordBatchReader::new( + batch_size, + 
build_array_reader(self.schema.clone(), projection, &row_group)?, + selection, + ); + + Ok((self, Some(reader))) + } +} + enum StreamState { /// At the start of a new row group, or the end of the parquet stream Init, /// Decoding a batch Decoding(ParquetRecordBatchReader), /// Reading data from input - Reading(BoxFuture<'static, Result<(T, InMemoryRowGroup)>>), + Reading(BoxFuture<'static, ReadResult>), /// Error Error, } @@ -305,20 +451,23 @@ impl std::fmt::Debug for StreamState { } } -/// An asynchronous [`Stream`] of [`RecordBatch`] for a parquet file +/// An asynchronous [`Stream`] of [`RecordBatch`] for a parquet file that can be +/// constructed using [`ParquetRecordBatchStreamBuilder`] pub struct ParquetRecordBatchStream { metadata: Arc, schema: SchemaRef, - batch_size: usize, + row_groups: VecDeque, projection: ProjectionMask, - row_groups: VecDeque, + batch_size: usize, + + selection: Option, /// This is an option so it can be moved into a future - input: Option, + reader: Option>, state: StreamState, } @@ -370,101 +519,40 @@ where None => return Poll::Ready(None), }; - let metadata = self.metadata.clone(); - let mut input = match self.input.take() { - Some(input) => input, - None => { - self.state = StreamState::Error; - return Poll::Ready(Some(Err(general_err!( - "input stream lost" - )))); - } - }; - - let projection = self.projection.clone(); - self.state = StreamState::Reading( - async move { - let row_group_metadata = metadata.row_group(row_group_idx); - let mut column_chunks = - vec![None; row_group_metadata.columns().len()]; - - // TODO: Combine consecutive ranges - let fetch_ranges = (0..column_chunks.len()) - .into_iter() - .filter_map(|idx| { - if !projection.leaf_included(idx) { - None - } else { - let column = row_group_metadata.column(idx); - let (start, length) = column.byte_range(); - - Some(start as usize..(start + length) as usize) - } - }) - .collect(); - - let mut chunk_data = - input.get_byte_ranges(fetch_ranges).await?.into_iter(); - - for (idx, chunk) in column_chunks.iter_mut().enumerate() { - if !projection.leaf_included(idx) { - continue; - } - - let column = row_group_metadata.column(idx); - - if let Some(data) = chunk_data.next() { - *chunk = Some(InMemoryColumnChunk { - num_values: column.num_values(), - compression: column.compression(), - physical_type: column.column_type(), - data, - }); - } - } - - Ok(( - input, - InMemoryRowGroup { - schema: metadata.file_metadata().schema_descr_ptr(), - row_count: row_group_metadata.num_rows() as usize, - column_chunks, - }, - )) - } - .boxed(), - ) - } - StreamState::Reading(f) => { - let result = futures::ready!(f.poll_unpin(cx)); - self.state = StreamState::Init; - - let row_group: Box = match result { - Ok((input, row_group)) => { - self.input = Some(input); - Box::new(row_group) - } - Err(e) => { - self.state = StreamState::Error; - return Poll::Ready(Some(Err(e))); - } - }; + let reader = self.reader.take().expect("lost reader"); - let parquet_schema = self.metadata.file_metadata().schema_descr_ptr(); + let row_count = + self.metadata.row_group(row_group_idx).num_rows() as usize; - let array_reader = build_array_reader( - parquet_schema, - self.schema.clone(), - self.projection.clone(), - row_group, - )?; + let selection = + self.selection.as_mut().map(|s| s.split_off(row_count)); - let batch_reader = - ParquetRecordBatchReader::try_new(self.batch_size, array_reader) - .expect("reader"); + let fut = reader + .read_row_group( + row_group_idx, + selection, + self.projection.clone(), + self.batch_size, + ) + 
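                        // (editorial note) read_row_group takes the ReaderFactory by value,
                        // so the boxed future owns the reader and its input, giving it the
                        // 'static lifetime required to be stored in StreamState::Reading.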
.boxed(); - self.state = StreamState::Decoding(batch_reader) + self.state = StreamState::Reading(fut) } + StreamState::Reading(f) => match ready!(f.poll_unpin(cx)) { + Ok((reader_factory, maybe_reader)) => { + self.reader = Some(reader_factory); + match maybe_reader { + // Read records from [`ParquetRecordBatchReader`] + Some(reader) => self.state = StreamState::Decoding(reader), + // All rows skipped, read next row group + None => self.state = StreamState::Init, + } + } + Err(e) => { + self.state = StreamState::Error; + return Poll::Ready(Some(Err(e))); + } + }, StreamState::Error => return Poll::Pending, } } @@ -478,9 +566,56 @@ struct InMemoryRowGroup { row_count: usize, } +impl InMemoryRowGroup { + /// Fetches the necessary column data into memory + async fn fetch( + &mut self, + input: &mut T, + metadata: &RowGroupMetaData, + projection: &ProjectionMask, + _selection: Option<&RowSelection>, + ) -> Result<()> { + // TODO: Use OffsetIndex and selection to prune pages + + let fetch_ranges = self + .column_chunks + .iter() + .enumerate() + .into_iter() + .filter_map(|(idx, chunk)| { + (chunk.is_none() && projection.leaf_included(idx)).then(|| { + let column = metadata.column(idx); + let (start, length) = column.byte_range(); + start as usize..(start + length) as usize + }) + }) + .collect(); + + let mut chunk_data = input.get_byte_ranges(fetch_ranges).await?.into_iter(); + + for (idx, chunk) in self.column_chunks.iter_mut().enumerate() { + if chunk.is_some() || !projection.leaf_included(idx) { + continue; + } + + let column = metadata.column(idx); + + if let Some(data) = chunk_data.next() { + *chunk = Some(InMemoryColumnChunk { + num_values: column.num_values(), + compression: column.compression(), + physical_type: column.column_type(), + data, + }); + } + } + Ok(()) + } +} + impl RowGroupCollection for InMemoryRowGroup { - fn schema(&self) -> Result { - Ok(self.schema.clone()) + fn schema(&self) -> SchemaDescPtr { + self.schema.clone() } fn num_rows(&self) -> usize { @@ -671,7 +806,10 @@ impl PageIterator for ColumnChunkIterator { #[cfg(test)] mod tests { use super::*; - use crate::arrow::{ArrowReader, ParquetFileArrowReader}; + use crate::arrow::arrow_reader::ArrowPredicateFn; + use crate::arrow::{ArrowReader, ArrowWriter, ParquetFileArrowReader}; + use crate::file::footer::parse_metadata; + use arrow::array::{Array, ArrayRef, Int32Array, StringArray}; use arrow::error::Result as ArrowResult; use futures::TryStreamExt; use std::sync::Mutex; @@ -844,4 +982,73 @@ mod tests { assert_eq!(second_page.page_type(), crate::basic::PageType::DATA_PAGE); assert_eq!(second_page.num_values(), 8); } + + #[tokio::test] + async fn test_row_filter() { + let a = StringArray::from_iter_values(["a", "b", "b", "b", "c", "c"]); + let b = StringArray::from_iter_values(["1", "2", "3", "4", "5", "6"]); + let c = Int32Array::from_iter(0..6); + let data = RecordBatch::try_from_iter([ + ("a", Arc::new(a) as ArrayRef), + ("b", Arc::new(b) as ArrayRef), + ("c", Arc::new(c) as ArrayRef), + ]) + .unwrap(); + + let mut buf = Vec::with_capacity(1024); + let mut writer = ArrowWriter::try_new(&mut buf, data.schema(), None).unwrap(); + writer.write(&data).unwrap(); + writer.close().unwrap(); + + let data: Bytes = buf.into(); + let metadata = parse_metadata(&data).unwrap(); + let parquet_schema = metadata.file_metadata().schema_descr_ptr(); + + let test = TestReader { + data, + metadata: Arc::new(metadata), + requests: Default::default(), + }; + let requests = test.requests.clone(); + + let a_filter = ArrowPredicateFn::new( 
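            // (editorial note) the predicate's ProjectionMask lists the columns its
            // closure reads (leaf 0 is column "a"), so only that column is fetched
            // and decoded while evaluating this predicate.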
+ ProjectionMask::leaves(&parquet_schema, vec![0]), + |batch| arrow::compute::eq_dyn_utf8_scalar(batch.column(0), "b"), + ); + + let b_filter = ArrowPredicateFn::new( + ProjectionMask::leaves(&parquet_schema, vec![1]), + |batch| arrow::compute::eq_dyn_utf8_scalar(batch.column(0), "4"), + ); + + let filter = RowFilter::new(vec![Box::new(a_filter), Box::new(b_filter)]); + + let mask = ProjectionMask::leaves(&parquet_schema, vec![0, 2]); + let stream = ParquetRecordBatchStreamBuilder::new(test) + .await + .unwrap() + .with_projection(mask.clone()) + .with_batch_size(1024) + .with_row_filter(filter) + .build() + .unwrap(); + + let batches: Vec<_> = stream.try_collect().await.unwrap(); + assert_eq!(batches.len(), 1); + + let batch = &batches[0]; + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 2); + + let col = batch.column(0); + let val = col.as_any().downcast_ref::().unwrap().value(0); + assert_eq!(val, "b"); + + let col = batch.column(1); + let val = col.as_any().downcast_ref::().unwrap().value(0); + assert_eq!(val, 3); + + // Should only have made 3 requests + assert_eq!(requests.lock().unwrap().len(), 3); + } } diff --git a/parquet/src/arrow/buffer/converter.rs b/parquet/src/arrow/buffer/converter.rs index 4cd0589424fc..aeca548bde72 100644 --- a/parquet/src/arrow/buffer/converter.rs +++ b/parquet/src/arrow/buffer/converter.rs @@ -17,18 +17,18 @@ use crate::data_type::{ByteArray, FixedLenByteArray, Int96}; use arrow::array::{ - Array, ArrayRef, BasicDecimalArray, BinaryArray, BinaryBuilder, Decimal128Array, - FixedSizeBinaryArray, FixedSizeBinaryBuilder, IntervalDayTimeArray, - IntervalDayTimeBuilder, IntervalYearMonthArray, IntervalYearMonthBuilder, - LargeBinaryArray, LargeBinaryBuilder, LargeStringArray, LargeStringBuilder, - StringArray, StringBuilder, TimestampNanosecondArray, + Array, ArrayRef, Decimal128Array, FixedSizeBinaryArray, FixedSizeBinaryBuilder, + IntervalDayTimeArray, IntervalDayTimeBuilder, IntervalYearMonthArray, + IntervalYearMonthBuilder, TimestampNanosecondArray, }; -use std::convert::{From, TryInto}; use std::sync::Arc; use crate::errors::Result; use std::marker::PhantomData; +#[cfg(test)] +use arrow::array::{StringArray, StringBuilder}; + /// A converter is used to consume record reader's content and convert it to arrow /// primitive array. 
pub trait Converter { @@ -185,8 +185,10 @@ impl Converter>, TimestampNanosecondArray> for Int96ArrayConve } } +#[cfg(test)] pub struct Utf8ArrayConverter {} +#[cfg(test)] impl Converter>, StringArray> for Utf8ArrayConverter { fn convert(&self, source: Vec>) -> Result { let data_size = source @@ -206,70 +208,9 @@ impl Converter>, StringArray> for Utf8ArrayConverter { } } -pub struct LargeUtf8ArrayConverter {} - -impl Converter>, LargeStringArray> for LargeUtf8ArrayConverter { - fn convert(&self, source: Vec>) -> Result { - let data_size = source - .iter() - .map(|x| x.as_ref().map(|b| b.len()).unwrap_or(0)) - .sum(); - - let mut builder = LargeStringBuilder::with_capacity(source.len(), data_size); - for v in source { - match v { - Some(array) => builder.append_value(array.as_utf8()?), - None => builder.append_null(), - } - } - - Ok(builder.finish()) - } -} - -pub struct BinaryArrayConverter {} - -impl Converter>, BinaryArray> for BinaryArrayConverter { - fn convert(&self, source: Vec>) -> Result { - let mut builder = BinaryBuilder::new(source.len()); - for v in source { - match v { - Some(array) => builder.append_value(array.data()), - None => builder.append_null(), - } - } - - Ok(builder.finish()) - } -} - -pub struct LargeBinaryArrayConverter {} - -impl Converter>, LargeBinaryArray> for LargeBinaryArrayConverter { - fn convert(&self, source: Vec>) -> Result { - let mut builder = LargeBinaryBuilder::new(source.len()); - for v in source { - match v { - Some(array) => builder.append_value(array.data()), - None => builder.append_null(), - } - } - - Ok(builder.finish()) - } -} - +#[cfg(test)] pub type Utf8Converter = ArrayRefConverter>, StringArray, Utf8ArrayConverter>; -pub type LargeUtf8Converter = - ArrayRefConverter>, LargeStringArray, LargeUtf8ArrayConverter>; -pub type BinaryConverter = - ArrayRefConverter>, BinaryArray, BinaryArrayConverter>; -pub type LargeBinaryConverter = ArrayRefConverter< - Vec>, - LargeBinaryArray, - LargeBinaryArrayConverter, ->; pub type Int96Converter = ArrayRefConverter>, TimestampNanosecondArray, Int96ArrayConverter>; @@ -299,32 +240,6 @@ pub type DecimalFixedLengthByteArrayConverter = ArrayRefConverter< pub type DecimalByteArrayConvert = ArrayRefConverter>, Decimal128Array, DecimalArrayConverter>; -pub struct FromConverter { - _source: PhantomData, - _dest: PhantomData, -} - -impl FromConverter -where - T: From, -{ - pub fn new() -> Self { - Self { - _source: PhantomData, - _dest: PhantomData, - } - } -} - -impl Converter for FromConverter -where - T: From, -{ - fn convert(&self, source: S) -> Result { - Ok(T::from(source)) - } -} - pub struct ArrayRefConverter { _source: PhantomData, _array: PhantomData, diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index b64b2946b91a..ae9e3590de3f 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -49,6 +49,7 @@ impl Default for DictionaryBuffer { impl DictionaryBuffer { + #[allow(unused)] pub fn len(&self) -> usize { match self { Self::Dict { keys, .. 
} => keys.len(), diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs index b68f59d514f2..18b4c9e07026 100644 --- a/parquet/src/arrow/record_reader/mod.rs +++ b/parquet/src/arrow/record_reader/mod.rs @@ -198,12 +198,6 @@ where self.num_records += buffered_records; self.num_values += buffered_values; - self.consume_def_levels(); - self.consume_rep_levels(); - self.consume_record_data(); - self.consume_bitmap(); - self.reset(); - let remaining = num_records - buffered_records; if remaining == 0 { @@ -220,6 +214,7 @@ where } /// Returns number of records stored in buffer. + #[allow(unused)] pub fn num_records(&self) -> usize { self.num_records } @@ -279,11 +274,6 @@ where .map(|levels| levels.split_bitmask(self.num_values)) } - /// Returns column reader. - pub(crate) fn column_reader(&self) -> Option<&ColumnReader> { - self.column_reader.as_ref() - } - /// Try to read one batch of data. fn read_one_batch(&mut self, batch_size: usize) -> Result { let rep_levels = self @@ -796,4 +786,186 @@ mod tests { assert_eq!(record_reader.num_records(), 8); assert_eq!(record_reader.num_values(), 14); } + + #[test] + fn test_skip_required_records() { + // Construct column schema + let message_type = " + message test_schema { + REQUIRED INT32 leaf; + } + "; + let desc = parse_message_type(message_type) + .map(|t| SchemaDescriptor::new(Arc::new(t))) + .map(|s| s.column(0)) + .unwrap(); + + // Construct record reader + let mut record_reader = RecordReader::::new(desc.clone()); + + // First page + + // Records data: + // test_schema + // leaf: 4 + // test_schema + // leaf: 7 + // test_schema + // leaf: 6 + // test_schema + // left: 3 + // test_schema + // left: 2 + { + let values = [4, 7, 6, 3, 2]; + let mut pb = DataPageBuilderImpl::new(desc.clone(), 5, true); + pb.add_values::(Encoding::PLAIN, &values); + let page = pb.consume(); + + let page_reader = Box::new(InMemoryPageReader::new(vec![page])); + record_reader.set_page_reader(page_reader).unwrap(); + assert_eq!(2, record_reader.skip_records(2).unwrap()); + assert_eq!(0, record_reader.num_records()); + assert_eq!(0, record_reader.num_values()); + assert_eq!(3, record_reader.read_records(3).unwrap()); + assert_eq!(3, record_reader.num_records()); + assert_eq!(3, record_reader.num_values()); + } + + // Second page + + // Records data: + // test_schema + // leaf: 8 + // test_schema + // leaf: 9 + { + let values = [8, 9]; + let mut pb = DataPageBuilderImpl::new(desc, 2, true); + pb.add_values::(Encoding::PLAIN, &values); + let page = pb.consume(); + + let page_reader = Box::new(InMemoryPageReader::new(vec![page])); + record_reader.set_page_reader(page_reader).unwrap(); + assert_eq!(2, record_reader.skip_records(10).unwrap()); + assert_eq!(3, record_reader.num_records()); + assert_eq!(3, record_reader.num_values()); + assert_eq!(0, record_reader.read_records(10).unwrap()); + } + + let mut bb = Int32BufferBuilder::new(3); + bb.append_slice(&[6, 3, 2]); + let expected_buffer = bb.finish(); + assert_eq!(expected_buffer, record_reader.consume_record_data()); + assert_eq!(None, record_reader.consume_def_levels()); + assert_eq!(None, record_reader.consume_bitmap()); + } + + #[test] + fn test_skip_optional_records() { + // Construct column schema + let message_type = " + message test_schema { + OPTIONAL Group test_struct { + OPTIONAL INT32 leaf; + } + } + "; + + let desc = parse_message_type(message_type) + .map(|t| SchemaDescriptor::new(Arc::new(t))) + .map(|s| s.column(0)) + .unwrap(); + + // Construct record reader + let mut 
record_reader = RecordReader::::new(desc.clone()); + + // First page + + // Records data: + // test_schema + // test_struct + // test_schema + // test_struct + // leaf: 7 + // test_schema + // test_schema + // test_struct + // leaf: 6 + // test_schema + // test_struct + // leaf: 6 + { + let values = [7, 6, 3]; + //empty, non-empty, empty, non-empty, non-empty + let def_levels = [1i16, 2i16, 0i16, 2i16, 2i16]; + let mut pb = DataPageBuilderImpl::new(desc.clone(), 5, true); + pb.add_def_levels(2, &def_levels); + pb.add_values::(Encoding::PLAIN, &values); + let page = pb.consume(); + + let page_reader = Box::new(InMemoryPageReader::new(vec![page])); + record_reader.set_page_reader(page_reader).unwrap(); + assert_eq!(2, record_reader.skip_records(2).unwrap()); + assert_eq!(0, record_reader.num_records()); + assert_eq!(0, record_reader.num_values()); + assert_eq!(3, record_reader.read_records(3).unwrap()); + assert_eq!(3, record_reader.num_records()); + assert_eq!(3, record_reader.num_values()); + } + + // Second page + + // Records data: + // test_schema + // test_schema + // test_struct + // left: 8 + { + let values = [8]; + //empty, non-empty + let def_levels = [0i16, 2i16]; + let mut pb = DataPageBuilderImpl::new(desc, 2, true); + pb.add_def_levels(2, &def_levels); + pb.add_values::(Encoding::PLAIN, &values); + let page = pb.consume(); + + let page_reader = Box::new(InMemoryPageReader::new(vec![page])); + record_reader.set_page_reader(page_reader).unwrap(); + assert_eq!(2, record_reader.skip_records(10).unwrap()); + assert_eq!(3, record_reader.num_records()); + assert_eq!(3, record_reader.num_values()); + assert_eq!(0, record_reader.read_records(10).unwrap()); + } + + // Verify result def levels + let mut bb = Int16BufferBuilder::new(7); + bb.append_slice(&[0i16, 2i16, 2i16]); + let expected_def_levels = bb.finish(); + assert_eq!( + Some(expected_def_levels), + record_reader.consume_def_levels() + ); + + // Verify bitmap + let expected_valid = &[false, true, true]; + let expected_buffer = Buffer::from_iter(expected_valid.iter().cloned()); + let expected_bitmap = Bitmap::from(expected_buffer); + assert_eq!(Some(expected_bitmap), record_reader.consume_bitmap()); + + // Verify result record data + let actual = record_reader.consume_record_data(); + let actual_values = actual.typed_data::(); + + let expected = &[0, 6, 3]; + assert_eq!(actual_values.len(), expected.len()); + + // Only validate valid values are equal + let iter = expected_valid.iter().zip(actual_values).zip(expected); + for ((valid, actual), expected) in iter { + if *valid { + assert_eq!(actual, expected) + } + } + } } diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs index 2cb47bc00e7e..01aefcd48e1d 100644 --- a/parquet/src/arrow/schema.rs +++ b/parquet/src/arrow/schema.rs @@ -73,7 +73,7 @@ pub fn parquet_to_arrow_schema_by_columns( // Add the Arrow metadata to the Parquet metadata skipping keys that collide if let Some(arrow_schema) = &maybe_schema { arrow_schema.metadata().iter().for_each(|(k, v)| { - metadata.entry(k.clone()).or_insert(v.clone()); + metadata.entry(k.clone()).or_insert_with(|| v.clone()); }); } @@ -100,7 +100,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { Ok(message) => message .header_as_schema() .map(arrow::ipc::convert::fb_to_schema) - .ok_or(arrow_err!("the message is not Arrow Schema")), + .ok_or_else(|| arrow_err!("the message is not Arrow Schema")), Err(err) => { // The flatbuffers implementation returns an error on verification error. 
Err(arrow_err!( diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 0cf1d5121b7e..7adbc8c1b6d0 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -18,7 +18,7 @@ //! Contains Rust mappings for Thrift definition. //! Refer to `parquet.thrift` file to see raw definitions. -use std::{convert, fmt, result, str}; +use std::{fmt, result, str}; use parquet_format as parquet; @@ -42,6 +42,7 @@ pub use parquet_format::{ /// For example INT16 is not included as a type since a good encoding of INT32 /// would handle this. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[allow(non_camel_case_types)] pub enum Type { BOOLEAN, INT32, @@ -62,7 +63,8 @@ pub enum Type { /// /// This struct was renamed from `LogicalType` in version 4.0.0. /// If targeting Parquet format 2.4.0 or above, please use [LogicalType] instead. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum ConvertedType { NONE, /// A BYTE_ARRAY actually contains UTF8 encoded chars. @@ -163,7 +165,7 @@ pub enum ConvertedType { /// This is an *entirely new* struct as of version /// 4.0.0. The struct previously named `LogicalType` was renamed to /// [`ConvertedType`]. Please see the README.md for more details. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub enum LogicalType { String, Map, @@ -196,7 +198,8 @@ pub enum LogicalType { // Mirrors `parquet::FieldRepetitionType` /// Representation of field types in schema. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum Repetition { /// Field is required (can not be null) and each record has exactly 1 value. REQUIRED, @@ -213,6 +216,7 @@ pub enum Repetition { /// Not all encodings are valid for all types. These enums are also used to specify the /// encoding of definition and repetition levels. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)] +#[allow(non_camel_case_types)] pub enum Encoding { /// Default byte encoding. /// - BOOLEAN - 1 bit per value, 0 is false; 1 is true. @@ -277,7 +281,7 @@ pub enum Encoding { // Mirrors `parquet::CompressionCodec` /// Supported compression algorithms. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Compression { UNCOMPRESSED, SNAPPY, @@ -293,7 +297,8 @@ pub enum Compression { /// Available data pages for Parquet file format. /// Note that some of the page types may not be supported. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum PageType { DATA_PAGE, INDEX_PAGE, @@ -312,7 +317,8 @@ pub enum PageType { /// /// See reference in /// -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum SortOrder { /// Signed (either value or legacy byte-wise) comparison. SIGNED, @@ -327,7 +333,8 @@ pub enum SortOrder { /// /// If column order is undefined, then it is the legacy behaviour and all values should /// be compared as signed values/bytes. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum ColumnOrder { /// Column uses the order defined by its logical or physical type /// (if there is no logical type), parquet-format 2.4.0+. 
@@ -489,7 +496,7 @@ impl fmt::Display for ColumnOrder { // ---------------------------------------------------------------------- // parquet::Type <=> Type conversion -impl convert::From for Type { +impl From for Type { fn from(value: parquet::Type) -> Self { match value { parquet::Type::Boolean => Type::BOOLEAN, @@ -504,7 +511,7 @@ impl convert::From for Type { } } -impl convert::From for parquet::Type { +impl From for parquet::Type { fn from(value: Type) -> Self { match value { Type::BOOLEAN => parquet::Type::Boolean, @@ -522,7 +529,7 @@ impl convert::From for parquet::Type { // ---------------------------------------------------------------------- // parquet::ConvertedType <=> ConvertedType conversion -impl convert::From> for ConvertedType { +impl From> for ConvertedType { fn from(option: Option) -> Self { match option { None => ConvertedType::NONE, @@ -558,7 +565,7 @@ impl convert::From> for ConvertedType { } } -impl convert::From for Option { +impl From for Option { fn from(value: ConvertedType) -> Self { match value { ConvertedType::NONE => None, @@ -595,7 +602,7 @@ impl convert::From for Option { // ---------------------------------------------------------------------- // parquet::LogicalType <=> LogicalType conversion -impl convert::From for LogicalType { +impl From for LogicalType { fn from(value: parquet::LogicalType) -> Self { match value { parquet::LogicalType::STRING(_) => LogicalType::String, @@ -627,7 +634,7 @@ impl convert::From for LogicalType { } } -impl convert::From for parquet::LogicalType { +impl From for parquet::LogicalType { fn from(value: LogicalType) -> Self { match value { LogicalType::String => parquet::LogicalType::STRING(Default::default()), @@ -723,7 +730,7 @@ impl From> for ConvertedType { // ---------------------------------------------------------------------- // parquet::FieldRepetitionType <=> Repetition conversion -impl convert::From for Repetition { +impl From for Repetition { fn from(value: parquet::FieldRepetitionType) -> Self { match value { parquet::FieldRepetitionType::Required => Repetition::REQUIRED, @@ -733,7 +740,7 @@ impl convert::From for Repetition { } } -impl convert::From for parquet::FieldRepetitionType { +impl From for parquet::FieldRepetitionType { fn from(value: Repetition) -> Self { match value { Repetition::REQUIRED => parquet::FieldRepetitionType::Required, @@ -746,7 +753,7 @@ impl convert::From for parquet::FieldRepetitionType { // ---------------------------------------------------------------------- // parquet::Encoding <=> Encoding conversion -impl convert::From for Encoding { +impl From for Encoding { fn from(value: parquet::Encoding) -> Self { match value { parquet::Encoding::Plain => Encoding::PLAIN, @@ -762,7 +769,7 @@ impl convert::From for Encoding { } } -impl convert::From for parquet::Encoding { +impl From for parquet::Encoding { fn from(value: Encoding) -> Self { match value { Encoding::PLAIN => parquet::Encoding::Plain, @@ -781,7 +788,7 @@ impl convert::From for parquet::Encoding { // ---------------------------------------------------------------------- // parquet::CompressionCodec <=> Compression conversion -impl convert::From for Compression { +impl From for Compression { fn from(value: parquet::CompressionCodec) -> Self { match value { parquet::CompressionCodec::Uncompressed => Compression::UNCOMPRESSED, @@ -795,7 +802,7 @@ impl convert::From for Compression { } } -impl convert::From for parquet::CompressionCodec { +impl From for parquet::CompressionCodec { fn from(value: Compression) -> Self { match value { 
Compression::UNCOMPRESSED => parquet::CompressionCodec::Uncompressed, @@ -812,7 +819,7 @@ impl convert::From for parquet::CompressionCodec { // ---------------------------------------------------------------------- // parquet::PageType <=> PageType conversion -impl convert::From for PageType { +impl From for PageType { fn from(value: parquet::PageType) -> Self { match value { parquet::PageType::DataPage => PageType::DATA_PAGE, @@ -823,7 +830,7 @@ impl convert::From for PageType { } } -impl convert::From for parquet::PageType { +impl From for parquet::PageType { fn from(value: PageType) -> Self { match value { PageType::DATA_PAGE => parquet::PageType::DataPage, diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index aa1d50563cd9..827aa7311f58 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -439,7 +439,7 @@ mod tests { // test default values assert_eq!(args.input_format, CsvDialect::Csv); assert_eq!(args.batch_size, 1000); - assert_eq!(args.has_header, false); + assert!(!args.has_header); assert_eq!(args.delimiter, None); assert_eq!(args.get_delimiter(), b','); assert_eq!(args.record_terminator, None); @@ -553,7 +553,7 @@ mod tests { Field::new("field5", DataType::Utf8, false), ])); - let reader_builder = configure_reader_builder(&args, arrow_schema.clone()); + let reader_builder = configure_reader_builder(&args, arrow_schema); let builder_debug = format!("{:?}", reader_builder); assert_debug_text(&builder_debug, "has_header", "false"); assert_debug_text(&builder_debug, "delimiter", "Some(44)"); @@ -585,7 +585,7 @@ mod tests { Field::new("field4", DataType::Utf8, false), Field::new("field5", DataType::Utf8, false), ])); - let reader_builder = configure_reader_builder(&args, arrow_schema.clone()); + let reader_builder = configure_reader_builder(&args, arrow_schema); let builder_debug = format!("{:?}", reader_builder); assert_debug_text(&builder_debug, "has_header", "true"); assert_debug_text(&builder_debug, "delimiter", "Some(9)"); diff --git a/parquet/src/bin/parquet-read.rs b/parquet/src/bin/parquet-read.rs index 0530afaa786a..927d96f8cde7 100644 --- a/parquet/src/bin/parquet-read.rs +++ b/parquet/src/bin/parquet-read.rs @@ -93,6 +93,6 @@ fn print_row(row: &Row, json: bool) { if json { println!("{}", row.to_json_value()) } else { - println!("{}", row.to_string()); + println!("{}", row); } } diff --git a/parquet/src/bin/parquet-schema.rs b/parquet/src/bin/parquet-schema.rs index b875b0e7102b..68c52def7c44 100644 --- a/parquet/src/bin/parquet-schema.rs +++ b/parquet/src/bin/parquet-schema.rs @@ -67,9 +67,9 @@ fn main() { println!("Metadata for file: {}", &filename); println!(); if verbose { - print_parquet_metadata(&mut std::io::stdout(), &metadata); + print_parquet_metadata(&mut std::io::stdout(), metadata); } else { - print_file_metadata(&mut std::io::stdout(), &metadata.file_metadata()); + print_file_metadata(&mut std::io::stdout(), metadata.file_metadata()); } } } diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index c61e9c0b343e..1658797cee7d 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -174,6 +174,12 @@ pub struct PageWriteSpec { pub bytes_written: u64, } +impl Default for PageWriteSpec { + fn default() -> Self { + Self::new() + } +} + impl PageWriteSpec { /// Creates new spec with default page write metrics. 
pub fn new() -> Self { diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 8e0fa5a4d5aa..1432c72b53f1 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -28,7 +28,7 @@ use crate::column::reader::decoder::{ use crate::data_type::*; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use crate::util::bit_util::{ceil, num_required_bits}; +use crate::util::bit_util::{ceil, num_required_bits, read_num_bytes}; use crate::util::memory::ByteBufferPtr; pub(crate) mod decoder; @@ -520,7 +520,7 @@ fn parse_v1_level( match encoding { Encoding::RLE => { let i32_size = std::mem::size_of::(); - let data_size = read_num_bytes!(i32, i32_size, buf.as_ref()) as usize; + let data_size = read_num_bytes::(i32_size, buf.as_ref()) as usize; Ok((i32_size + data_size, buf.range(i32_size, data_size))) } Encoding::BIT_PACKED => { @@ -544,8 +544,8 @@ mod tests { use crate::basic::Type as PhysicalType; use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; - use crate::util::test_common::make_pages; use crate::util::test_common::page_util::InMemoryPageReader; + use crate::util::test_common::rand_gen::make_pages; const NUM_LEVELS: usize = 128; const NUM_PAGES: usize = 2; @@ -1231,6 +1231,7 @@ mod tests { // Helper function for the general case of `read_batch()` where `values`, // `def_levels` and `rep_levels` are always provided with enough space. + #[allow(clippy::too_many_arguments)] fn test_read_batch_general( &mut self, desc: ColumnDescPtr, @@ -1262,6 +1263,7 @@ mod tests { // Helper function to test `read_batch()` method with custom buffers for values, // definition and repetition levels. + #[allow(clippy::too_many_arguments)] fn test_read_batch( &mut self, desc: ColumnDescPtr, diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index d7363129f1ea..4fb4f210e146 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -168,7 +168,6 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { // Set either main encoder or fallback encoder. let encoder = get_encoder( - descr.clone(), props .encoding(descr.path()) .unwrap_or_else(|| fallback_encoding(T::get_physical_type(), props)), diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index ce773c19d52b..669cacee6460 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -153,6 +153,29 @@ type ColumnCloseResult = ( Option, ); +// Metrics per page +#[derive(Default)] +struct PageMetrics { + num_buffered_values: u32, + num_buffered_rows: u32, + num_page_nulls: u64, +} + +// Metrics per column writer +struct ColumnMetrics { + total_bytes_written: u64, + total_rows_written: u64, + total_uncompressed_size: u64, + total_compressed_size: u64, + total_num_values: u64, + dictionary_page_offset: Option, + data_page_offset: Option, + min_column_value: Option, + max_column_value: Option, + num_column_nulls: u64, + column_distinct_count: Option, +} + /// Typed column writer for a primitive column. 
pub type ColumnWriterImpl<'a, T> = GenericColumnWriter<'a, ColumnValueEncoderImpl>; @@ -167,31 +190,13 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> { compressor: Option>, encoder: E, - // Metrics per page - /// The number of values including nulls in the in-progress data page - num_buffered_values: u32, - /// The number of rows in the in-progress data page - num_buffered_rows: u32, - /// The number of nulls in the in-progress data page - num_page_nulls: u64, - + page_metrics: PageMetrics, // Metrics per column writer - total_bytes_written: u64, - total_rows_written: u64, - total_uncompressed_size: u64, - total_compressed_size: u64, - total_num_values: u64, - dictionary_page_offset: Option, - data_page_offset: Option, - min_column_value: Option, - max_column_value: Option, - num_column_nulls: u64, - column_distinct_count: Option, + column_metrics: ColumnMetrics, /// The order of encodings within the generated metadata does not impact its meaning, /// but we use a BTreeSet so that the output is deterministic encodings: BTreeSet, - // Reused buffers def_levels_sink: Vec, rep_levels_sink: Vec, @@ -226,29 +231,34 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { codec, compressor, encoder, - num_buffered_values: 0, - num_buffered_rows: 0, - num_page_nulls: 0, - total_bytes_written: 0, - total_rows_written: 0, - total_uncompressed_size: 0, - total_compressed_size: 0, - total_num_values: 0, - dictionary_page_offset: None, - data_page_offset: None, def_levels_sink: vec![], rep_levels_sink: vec![], data_pages: VecDeque::new(), - min_column_value: None, - max_column_value: None, - num_column_nulls: 0, - column_distinct_count: None, + page_metrics: PageMetrics { + num_buffered_values: 0, + num_buffered_rows: 0, + num_page_nulls: 0, + }, + column_metrics: ColumnMetrics { + total_bytes_written: 0, + total_rows_written: 0, + total_uncompressed_size: 0, + total_compressed_size: 0, + total_num_values: 0, + dictionary_page_offset: None, + data_page_offset: None, + min_column_value: None, + max_column_value: None, + num_column_nulls: 0, + column_distinct_count: None, + }, column_index_builder: ColumnIndexBuilder::new(), offset_index_builder: OffsetIndexBuilder::new(), encodings, } } + #[allow(clippy::too_many_arguments)] pub(crate) fn write_batch_internal( &mut self, values: &E::Values, @@ -284,8 +294,16 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { if self.statistics_enabled == EnabledStatistics::Chunk { match (min, max) { (Some(min), Some(max)) => { - update_min(&self.descr, min, &mut self.min_column_value); - update_max(&self.descr, max, &mut self.max_column_value); + update_min( + &self.descr, + min, + &mut self.column_metrics.min_column_value, + ); + update_max( + &self.descr, + max, + &mut self.column_metrics.max_column_value, + ); } (None, Some(_)) | (Some(_), None) => { panic!("min/max should be both set or both None") @@ -293,8 +311,16 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { (None, None) => { if let Some((min, max)) = self.encoder.min_max(values, value_indices) { - update_min(&self.descr, &min, &mut self.min_column_value); - update_max(&self.descr, &max, &mut self.max_column_value); + update_min( + &self.descr, + &min, + &mut self.column_metrics.min_column_value, + ); + update_max( + &self.descr, + &max, + &mut self.column_metrics.max_column_value, + ); } } }; @@ -302,9 +328,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // We can only set the distinct count if there are no other writes if 
self.encoder.num_values() == 0 { - self.column_distinct_count = distinct_count; + self.column_metrics.column_distinct_count = distinct_count; } else { - self.column_distinct_count = None; + self.column_metrics.column_distinct_count = None; } let mut values_offset = 0; @@ -385,19 +411,19 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Returns total number of bytes written by this column writer so far. /// This value is also returned when column writer is closed. pub fn get_total_bytes_written(&self) -> u64 { - self.total_bytes_written + self.column_metrics.total_bytes_written } /// Returns total number of rows written by this column writer so far. /// This value is also returned when column writer is closed. pub fn get_total_rows_written(&self) -> u64 { - self.total_rows_written + self.column_metrics.total_rows_written } /// Finalises writes and closes the column writer. /// Returns total bytes written, total rows written and column chunk metadata. pub fn close(mut self) -> Result { - if self.num_buffered_values > 0 { + if self.page_metrics.num_buffered_values > 0 { self.add_data_page()?; } if self.encoder.has_dictionary() { @@ -417,8 +443,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { }; Ok(( - self.total_bytes_written, - self.total_rows_written, + self.column_metrics.total_bytes_written, + self.column_metrics.total_rows_written, metadata, column_index, offset_index, @@ -464,7 +490,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { values_to_write += 1; } else { // We must always compute this as it is used to populate v2 pages - self.num_page_nulls += 1 + self.page_metrics.num_page_nulls += 1 } } @@ -486,14 +512,14 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Count the occasions where we start a new row for &level in levels { - self.num_buffered_rows += (level == 0) as u32 + self.page_metrics.num_buffered_rows += (level == 0) as u32 } self.rep_levels_sink.extend_from_slice(levels); } else { // Each value is exactly one row. // Equals to the number of values, we count nulls as well. - self.num_buffered_rows += num_levels as u32; + self.page_metrics.num_buffered_rows += num_levels as u32; } match value_indices { @@ -504,7 +530,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { None => self.encoder.write(values, values_offset, values_to_write)?, } - self.num_buffered_values += num_levels as u32; + self.page_metrics.num_buffered_values += num_levels as u32; if self.should_add_data_page() { self.add_data_page()?; @@ -547,7 +573,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Prepares and writes dictionary and all data pages into page writer. fn dict_fallback(&mut self) -> Result<()> { // At this point we know that we need to fall back. 
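Note: write_mini_batch above counts a new row whenever it sees a repetition level of 0, following the usual Dremel convention that repetition level 0 starts a new top-level record. A standalone illustration:

    fn count_rows(rep_levels: &[i16]) -> u32 {
        // repetition level 0 marks the first value of a new top-level row
        rep_levels.iter().map(|&level| (level == 0) as u32).sum()
    }

    fn main() {
        // three rows: [a, b], [c], [d, e, f]
        assert_eq!(count_rows(&[0, 1, 0, 0, 1, 1]), 3);
    }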
- if self.num_buffered_values > 0 { + if self.page_metrics.num_buffered_values > 0 { self.add_data_page()?; } self.write_dictionary_page()?; @@ -558,7 +584,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Update the column index and offset index when adding the data page fn update_column_offset_index(&mut self, page_statistics: &Option) { // update the column index - let null_page = (self.num_buffered_rows as u64) == self.num_page_nulls; + let null_page = (self.page_metrics.num_buffered_rows as u64) + == self.page_metrics.num_page_nulls; // a page contains only null values, // and writers have to set the corresponding entries in min_values and max_values to byte[0] if null_page && self.column_index_builder.valid() { @@ -566,7 +593,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { null_page, &[0; 1], &[0; 1], - self.num_page_nulls as i64, + self.page_metrics.num_page_nulls as i64, ); } else if self.column_index_builder.valid() { // from page statistics @@ -580,7 +607,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { null_page, stat.min_bytes(), stat.max_bytes(), - self.num_page_nulls as i64, + self.page_metrics.num_page_nulls as i64, ); } } @@ -588,7 +615,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // update the offset index self.offset_index_builder - .append_row_count(self.num_buffered_rows as i64); + .append_row_count(self.page_metrics.num_buffered_rows as i64); } /// Adds data page. @@ -600,17 +627,17 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let max_def_level = self.descr.max_def_level(); let max_rep_level = self.descr.max_rep_level(); - self.num_column_nulls += self.num_page_nulls; + self.column_metrics.num_column_nulls += self.page_metrics.num_page_nulls; let page_statistics = match (values_data.min_value, values_data.max_value) { (Some(min), Some(max)) => { - update_min(&self.descr, &min, &mut self.min_column_value); - update_max(&self.descr, &max, &mut self.max_column_value); + update_min(&self.descr, &min, &mut self.column_metrics.min_column_value); + update_max(&self.descr, &max, &mut self.column_metrics.max_column_value); Some(Statistics::new( Some(min), Some(max), None, - self.num_page_nulls, + self.page_metrics.num_page_nulls, false, )) } @@ -655,7 +682,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let data_page = Page::DataPage { buf: ByteBufferPtr::new(buffer), - num_values: self.num_buffered_values, + num_values: self.page_metrics.num_buffered_values, encoding: values_data.encoding, def_level_encoding: Encoding::RLE, rep_level_encoding: Encoding::RLE, @@ -696,10 +723,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let data_page = Page::DataPageV2 { buf: ByteBufferPtr::new(buffer), - num_values: self.num_buffered_values, + num_values: self.page_metrics.num_buffered_values, encoding: values_data.encoding, - num_nulls: self.num_page_nulls as u32, - num_rows: self.num_buffered_rows, + num_nulls: self.page_metrics.num_page_nulls as u32, + num_rows: self.page_metrics.num_buffered_rows, def_levels_byte_len: def_levels_byte_len as u32, rep_levels_byte_len: rep_levels_byte_len as u32, is_compressed: self.compressor.is_some(), @@ -718,14 +745,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } // Update total number of rows. - self.total_rows_written += self.num_buffered_rows as u64; + self.column_metrics.total_rows_written += + self.page_metrics.num_buffered_rows as u64; // Reset state. 
self.rep_levels_sink.clear(); self.def_levels_sink.clear(); - self.num_buffered_values = 0; - self.num_buffered_rows = 0; - self.num_page_nulls = 0; + self.page_metrics = PageMetrics::default(); Ok(()) } @@ -735,7 +761,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { #[inline] fn flush_data_pages(&mut self) -> Result<()> { // Write all outstanding data to a new page. - if self.num_buffered_values > 0 { + if self.page_metrics.num_buffered_values > 0 { self.add_data_page()?; } @@ -748,12 +774,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Assembles and writes column chunk metadata. fn write_column_metadata(&mut self) -> Result { - let total_compressed_size = self.total_compressed_size as i64; - let total_uncompressed_size = self.total_uncompressed_size as i64; - let num_values = self.total_num_values as i64; - let dict_page_offset = self.dictionary_page_offset.map(|v| v as i64); + let total_compressed_size = self.column_metrics.total_compressed_size as i64; + let total_uncompressed_size = self.column_metrics.total_uncompressed_size as i64; + let num_values = self.column_metrics.total_num_values as i64; + let dict_page_offset = + self.column_metrics.dictionary_page_offset.map(|v| v as i64); // If data page offset is not set, then no pages have been written - let data_page_offset = self.data_page_offset.unwrap_or(0) as i64; + let data_page_offset = self.column_metrics.data_page_offset.unwrap_or(0) as i64; let file_offset = match dict_page_offset { Some(dict_offset) => dict_offset + total_compressed_size, @@ -772,10 +799,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { if self.statistics_enabled != EnabledStatistics::None { let statistics = Statistics::new( - self.min_column_value.clone(), - self.max_column_value.clone(), - self.column_distinct_count, - self.num_column_nulls, + self.column_metrics.min_column_value.clone(), + self.column_metrics.max_column_value.clone(), + self.column_metrics.column_distinct_count, + self.column_metrics.num_column_nulls, false, ); builder = builder.set_statistics(statistics); @@ -860,33 +887,27 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Updates column writer metrics with each page metadata. #[inline] fn update_metrics_for_page(&mut self, page_spec: PageWriteSpec) { - self.total_uncompressed_size += page_spec.uncompressed_size as u64; - self.total_compressed_size += page_spec.compressed_size as u64; - self.total_num_values += page_spec.num_values as u64; - self.total_bytes_written += page_spec.bytes_written; + self.column_metrics.total_uncompressed_size += page_spec.uncompressed_size as u64; + self.column_metrics.total_compressed_size += page_spec.compressed_size as u64; + self.column_metrics.total_num_values += page_spec.num_values as u64; + self.column_metrics.total_bytes_written += page_spec.bytes_written; match page_spec.page_type { PageType::DATA_PAGE | PageType::DATA_PAGE_V2 => { - if self.data_page_offset.is_none() { - self.data_page_offset = Some(page_spec.offset); + if self.column_metrics.data_page_offset.is_none() { + self.column_metrics.data_page_offset = Some(page_spec.offset); } } PageType::DICTIONARY_PAGE => { assert!( - self.dictionary_page_offset.is_none(), + self.column_metrics.dictionary_page_offset.is_none(), "Dictionary offset is already set" ); - self.dictionary_page_offset = Some(page_spec.offset); + self.column_metrics.dictionary_page_offset = Some(page_spec.offset); } _ => {} } } - - /// Returns reference to the underlying page writer. 
- /// This method is intended to use in tests only. - fn get_page_writer_ref(&self) -> &dyn PageWriter { - self.page_writer.as_ref() - } } fn update_min( @@ -1075,7 +1096,7 @@ mod tests { writer::SerializedPageWriter, }; use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; - use crate::util::{io::FileSource, test_common::random_numbers_range}; + use crate::util::{io::FileSource, test_common::rand_gen::random_numbers_range}; use super::*; @@ -2382,20 +2403,6 @@ mod tests { get_typed_column_writer::(column_writer) } - /// Returns decimals column reader. - fn get_test_decimals_column_reader( - page_reader: Box, - max_def_level: i16, - max_rep_level: i16, - ) -> ColumnReaderImpl { - let descr = Arc::new(get_test_decimals_column_descr::( - max_def_level, - max_rep_level, - )); - let column_reader = get_column_reader(descr, page_reader); - get_typed_column_reader::(column_reader) - } - /// Returns descriptor for Decimal type with primitive column. fn get_test_decimals_column_descr( max_def_level: i16, @@ -2430,20 +2437,6 @@ mod tests { get_typed_column_writer::(column_writer) } - /// Returns column reader for UINT32 Column provided as ConvertedType only - fn get_test_unsigned_int_given_as_converted_column_reader( - page_reader: Box, - max_def_level: i16, - max_rep_level: i16, - ) -> ColumnReaderImpl { - let descr = Arc::new(get_test_converted_type_unsigned_integer_column_descr::( - max_def_level, - max_rep_level, - )); - let column_reader = get_column_reader(descr, page_reader); - get_typed_column_reader::(column_reader) - } - /// Returns column descriptor for UINT32 Column provided as ConvertedType only fn get_test_converted_type_unsigned_integer_column_descr( max_def_level: i16, diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index a5e49360a28a..ee5141cbe140 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -329,7 +329,7 @@ pub use zstd_codec::*; mod tests { use super::*; - use crate::util::test_common::*; + use crate::util::test_common::rand_gen::random_bytes; fn test_roundtrip(c: CodecType, data: &[u8]) { let mut c1 = create_codec(c).unwrap().unwrap(); diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 43c9a4238a71..7870ca36a6d4 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -565,7 +565,7 @@ impl AsBytes for str { pub(crate) mod private { use crate::encodings::decoding::PlainDecoderDetails; - use crate::util::bit_util::{BitReader, BitWriter}; + use crate::util::bit_util::{read_num_bytes, BitReader, BitWriter}; use crate::util::memory::ByteBufferPtr; use crate::basic::Type; @@ -574,8 +574,6 @@ pub(crate) mod private { use super::{ParquetError, Result, SliceAsBytes}; - pub type BitIndex = u64; - /// Sealed trait to start to remove specialisation from implementations /// /// This is done to force the associated value type to be unimplementable outside of this @@ -710,19 +708,6 @@ pub(crate) mod private { } } - /// Hopelessly unsafe function that emulates `num::as_ne_bytes` - /// - /// It is not recommended to use this outside of this private module as, while it - /// _should_ work for primitive values, it is little better than a transmutation - /// and can act as a backdoor into mis-interpreting types as arbitary byte slices - #[inline] - fn as_raw<'a, T>(value: *const T) -> &'a [u8] { - unsafe { - let value = value as *const u8; - std::slice::from_raw_parts(value, std::mem::size_of::()) - } - } - macro_rules! 
impl_from_raw { ($ty: ty, $physical_ty: expr, $self: ident => $as_i64: block) => { impl ParquetValueType for $ty { @@ -907,21 +892,6 @@ pub(crate) mod private { } } - // TODO - Why does macro importing fail? - /// Reads `$size` of bytes from `$src`, and reinterprets them as type `$ty`, in - /// little-endian order. `$ty` must implement the `Default` trait. Otherwise this won't - /// compile. - /// This is copied and modified from byteorder crate. - macro_rules! read_num_bytes { - ($ty:ty, $size:expr, $src:expr) => {{ - assert!($size <= $src.len()); - let mut buffer = - <$ty as $crate::util::bit_util::FromBytes>::Buffer::default(); - buffer.as_mut()[..$size].copy_from_slice(&$src[..$size]); - <$ty>::from_ne_bytes(buffer) - }}; - } - impl ParquetValueType for super::ByteArray { const PHYSICAL_TYPE: Type = Type::BYTE_ARRAY; @@ -961,9 +931,9 @@ pub(crate) mod private { .as_mut() .expect("set_data should have been called"); let num_values = std::cmp::min(buffer.len(), decoder.num_values); - for i in 0..num_values { + for val_array in buffer.iter_mut().take(num_values) { let len: usize = - read_num_bytes!(u32, 4, data.start_from(decoder.start).as_ref()) + read_num_bytes::(4, data.start_from(decoder.start).as_ref()) as usize; decoder.start += std::mem::size_of::(); @@ -971,7 +941,7 @@ pub(crate) mod private { return Err(eof_err!("Not enough bytes to decode")); } - let val: &mut Self = buffer[i].as_mut_any().downcast_mut().unwrap(); + let val: &mut Self = val_array.as_mut_any().downcast_mut().unwrap(); val.set_data(data.range(decoder.start, len)); decoder.start += len; @@ -990,7 +960,7 @@ pub(crate) mod private { for _ in 0..num_values { let len: usize = - read_num_bytes!(u32, 4, data.start_from(decoder.start).as_ref()) + read_num_bytes::(4, data.start_from(decoder.start).as_ref()) as usize; decoder.start += std::mem::size_of::() + len; } diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 58aa592d1424..86941ffe0eeb 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -322,6 +322,12 @@ pub struct DictDecoder { num_values: usize, } +impl Default for DictDecoder { + fn default() -> Self { + Self::new() + } +} + impl DictDecoder { /// Creates new dictionary decoder. pub fn new() -> Self { @@ -394,6 +400,12 @@ pub struct RleValueDecoder { _phantom: PhantomData, } +impl Default for RleValueDecoder { + fn default() -> Self { + Self::new() + } +} + impl RleValueDecoder { pub fn new() -> Self { Self { @@ -412,7 +424,7 @@ impl Decoder for RleValueDecoder { // We still need to remove prefix of i32 from the stream. 
const I32_SIZE: usize = mem::size_of::(); - let data_size = read_num_bytes!(i32, I32_SIZE, data.as_ref()) as usize; + let data_size = bit_util::read_num_bytes::(I32_SIZE, data.as_ref()) as usize; self.decoder = RleDecoder::new(1); self.decoder.set_data(data.range(I32_SIZE, data_size)); self.values_left = num_values; @@ -485,6 +497,15 @@ pub struct DeltaBitPackDecoder { last_value: T::T, } +impl Default for DeltaBitPackDecoder +where + T::T: Default + FromPrimitive + WrappingAdd + Copy, +{ + fn default() -> Self { + Self::new() + } +} + impl DeltaBitPackDecoder where T::T: Default + FromPrimitive + WrappingAdd + Copy, @@ -706,8 +727,6 @@ where Ok(to_read) } - - fn values_left(&self) -> usize { self.values_left } @@ -717,8 +736,61 @@ where } fn skip(&mut self, num_values: usize) -> Result { - let mut buffer = vec![T::T::default(); num_values]; - self.get(&mut buffer) + let mut skip = 0; + let to_skip = num_values.min(self.values_left); + if to_skip == 0 { + return Ok(0); + } + + // try to consume first value in header. + if let Some(value) = self.first_value.take() { + self.last_value = value; + skip += 1; + self.values_left -= 1; + } + + let mini_block_batch_size = match T::T::PHYSICAL_TYPE { + Type::INT32 => 32, + Type::INT64 => 64, + _ => unreachable!(), + }; + + let mut skip_buffer = vec![T::T::default(); mini_block_batch_size]; + while skip < to_skip { + if self.mini_block_remaining == 0 { + self.next_mini_block()?; + } + + let bit_width = self.mini_block_bit_widths[self.mini_block_idx] as usize; + let mini_block_to_skip = self.mini_block_remaining.min(to_skip - skip); + let mini_block_should_skip = mini_block_to_skip; + + let skip_count = self + .bit_reader + .get_batch(&mut skip_buffer[0..mini_block_to_skip], bit_width); + + if skip_count != mini_block_to_skip { + return Err(general_err!( + "Expected to skip {} values from mini block got {}.", + mini_block_batch_size, + skip_count + )); + } + + for v in &mut skip_buffer[0..skip_count] { + *v = v + .wrapping_add(&self.min_delta) + .wrapping_add(&self.last_value); + + self.last_value = *v; + } + + skip += mini_block_should_skip; + self.mini_block_remaining -= mini_block_should_skip; + self.values_left -= mini_block_should_skip; + } + + Ok(to_skip) } } @@ -751,6 +823,12 @@ pub struct DeltaLengthByteArrayDecoder { _phantom: PhantomData, } +impl Default for DeltaLengthByteArrayDecoder { + fn default() -> Self { + Self::new() + } +} + impl DeltaLengthByteArrayDecoder { /// Creates new delta length byte array decoder. 
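Note: the new skip implementation for DeltaBitPackDecoder above still unpacks each mini block into a small scratch buffer, but only to keep last_value current; nothing is copied into the caller's output. A simplified, self-contained sketch of that bookkeeping (names are illustrative, not the crate's API):

    /// Advance the running value over `n` already-unpacked deltas without
    /// keeping the decoded outputs around.
    fn skip_deltas(last_value: &mut i64, min_delta: i64, deltas: &[i64], n: usize) -> usize {
        let n = n.min(deltas.len());
        for d in &deltas[..n] {
            // each value = previous value + min_delta + stored delta
            *last_value = last_value.wrapping_add(min_delta).wrapping_add(*d);
        }
        n
    }

    fn main() {
        let mut last = 10;
        let skipped = skip_deltas(&mut last, 1, &[0, 2, 0], 3);
        assert_eq!((skipped, last), (3, 15)); // 10 -> 11 -> 14 -> 15
    }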
pub fn new() -> Self { @@ -829,7 +907,10 @@ impl Decoder for DeltaLengthByteArrayDecoder { Type::BYTE_ARRAY => { let num_values = cmp::min(num_values, self.num_values); - let next_offset: i32 = self.lengths[self.current_idx..self.current_idx + num_values].iter().sum(); + let next_offset: i32 = self.lengths + [self.current_idx..self.current_idx + num_values] + .iter() + .sum(); self.current_idx += num_values; self.offset += next_offset as usize; @@ -837,8 +918,9 @@ impl Decoder for DeltaLengthByteArrayDecoder { self.num_values -= num_values; Ok(num_values) } - other_type => Err(general_err!( - "DeltaLengthByteArrayDecoder not support {}, only support byte array", other_type + other_type => Err(general_err!( + "DeltaLengthByteArrayDecoder not support {}, only support byte array", + other_type )), } } @@ -874,6 +956,12 @@ pub struct DeltaByteArrayDecoder { _phantom: PhantomData, } +impl Default for DeltaByteArrayDecoder { + fn default() -> Self { + Self::new() + } +} + impl DeltaByteArrayDecoder { /// Creates new delta byte array decoder. pub fn new() -> Self { @@ -990,7 +1078,7 @@ mod tests { use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, }; - use crate::util::{bit_util::set_array_bit, test_common::RandGen}; + use crate::util::test_common::rand_gen::RandGen; #[test] fn test_get_decoders() { @@ -1068,13 +1156,7 @@ mod tests { fn test_plain_skip_all_int32() { let data = vec![42, 18, 52]; let data_bytes = Int32Type::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 5, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 5, -1, &[]); } #[test] @@ -1096,7 +1178,6 @@ mod tests { ); } - #[test] fn test_plain_decode_int64() { let data = vec![42, 18, 52]; @@ -1128,16 +1209,9 @@ mod tests { fn test_plain_skip_all_int64() { let data = vec![42, 18, 52]; let data_bytes = Int64Type::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 3, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 3, -1, &[]); } - #[test] fn test_plain_decode_float() { let data = vec![3.14, 2.414, 12.51]; @@ -1169,13 +1243,7 @@ mod tests { fn test_plain_skip_all_float() { let data = vec![3.14, 2.414, 12.51]; let data_bytes = FloatType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 4, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 4, -1, &[]); } #[test] @@ -1195,13 +1263,7 @@ mod tests { fn test_plain_skip_all_double() { let data = vec![3.14f64, 2.414f64, 12.51f64]; let data_bytes = DoubleType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 5, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 5, -1, &[]); } #[test] @@ -1261,13 +1323,7 @@ mod tests { data[2].set_data(10, 20, 30); data[3].set_data(40, 50, 60); let data_bytes = Int96Type::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 4, - 8, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 4, 8, -1, &[]); } #[test] @@ -1307,16 +1363,9 @@ mod tests { false, true, false, false, true, false, true, true, false, true, ]; let data_bytes = BoolType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 10, - 20, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 10, 20, -1, &[]); } - #[test] fn test_plain_decode_byte_array() { let mut data = vec![ByteArray::new(); 2]; @@ -1354,13 +1403,7 @@ mod tests 
{ data[0].set_data(ByteBufferPtr::new(String::from("hello").into_bytes())); data[1].set_data(ByteBufferPtr::new(String::from("parquet").into_bytes())); let data_bytes = ByteArrayType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 2, - 2, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 2, 2, -1, &[]); } #[test] @@ -1587,7 +1630,6 @@ mod tests { ]; test_skip::(block_data.clone(), Encoding::DELTA_BINARY_PACKED, 5); test_skip::(block_data, Encoding::DELTA_BINARY_PACKED, 100); - } #[test] @@ -1833,8 +1875,7 @@ mod tests { let col_descr = create_test_col_desc_ptr(-1, T::get_physical_type()); // Encode data - let mut encoder = - get_encoder::(col_descr.clone(), encoding).expect("get encoder"); + let mut encoder = get_encoder::(encoding).expect("get encoder"); for v in &data[..] { encoder.put(&v[..]).expect("ok to encode"); @@ -1867,17 +1908,14 @@ mod tests { let col_descr = create_test_col_desc_ptr(-1, T::get_physical_type()); // Encode data - let mut encoder = - get_encoder::(col_descr.clone(), encoding).expect("get encoder"); + let mut encoder = get_encoder::(encoding).expect("get encoder"); encoder.put(&data).expect("ok to encode"); let bytes = encoder.flush_buffer().expect("ok to flush buffer"); let mut decoder = get_decoder::(col_descr, encoding).expect("get decoder"); - decoder - .set_data(bytes, data.len()) - .expect("ok to set data"); + decoder.set_data(bytes, data.len()).expect("ok to set data"); if skip >= data.len() { let skipped = decoder.skip(skip).expect("ok to skip"); @@ -1894,7 +1932,7 @@ mod tests { let expected = &data[skip..]; let mut buffer = vec![T::T::default(); remaining]; let fetched = decoder.get(&mut buffer).expect("ok to decode"); - assert_eq!(remaining,fetched); + assert_eq!(remaining, fetched); assert_eq!(&buffer, expected); } } @@ -1966,7 +2004,7 @@ mod tests { v.push(0); } if *item { - set_array_bit(&mut v[..], i); + v[i / 8] |= 1 << (i % 8); } } v diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs index a7855cc84606..18deba65e687 100644 --- a/parquet/src/encodings/encoding/dict_encoder.rs +++ b/parquet/src/encodings/encoding/dict_encoder.rs @@ -73,9 +73,6 @@ impl Storage for KeyStorage { /// (max bit width = 32), followed by the values encoded using RLE/Bit packed described /// above (with the given bit width). pub struct DictEncoder { - /// Descriptor for the column to be encoded. - desc: ColumnDescPtr, - interner: Interner>, /// The buffered indices @@ -92,7 +89,6 @@ impl DictEncoder { }; Self { - desc, interner: Interner::new(storage), indices: vec![], } @@ -117,7 +113,7 @@ impl DictEncoder { /// Writes out the dictionary values with PLAIN encoding in a byte buffer, and return /// the result. 
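Note: set_array_bit / unset_array_bit are removed from bit_util (see the bit_util.rs hunk further below), so call sites such as the test above now spell out the bit arithmetic. For reference, setting and clearing bit i in an LSB-first packed bitmap:

    fn set_bit(bits: &mut [u8], i: usize) {
        bits[i / 8] |= 1 << (i % 8);
    }

    fn unset_bit(bits: &mut [u8], i: usize) {
        bits[i / 8] &= !(1 << (i % 8));
    }

    fn main() {
        let mut bits = [0u8; 2];
        set_bit(&mut bits, 1);
        set_bit(&mut bits, 10);
        assert_eq!(bits, [0b0000_0010, 0b0000_0100]);
        unset_bit(&mut bits, 10);
        assert_eq!(bits, [0b0000_0010, 0b0000_0000]);
    }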
pub fn write_dict(&self) -> Result { - let mut plain_encoder = PlainEncoder::::new(self.desc.clone(), vec![]); + let mut plain_encoder = PlainEncoder::::new(); plain_encoder.put(&self.interner.storage().uniques)?; plain_encoder.flush_buffer() } diff --git a/parquet/src/encodings/encoding/mod.rs b/parquet/src/encodings/encoding/mod.rs index b0c8fa10faa7..050f1b9f8a63 100644 --- a/parquet/src/encodings/encoding/mod.rs +++ b/parquet/src/encodings/encoding/mod.rs @@ -24,7 +24,6 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::*; use crate::encodings::rle::RleEncoder; use crate::errors::{ParquetError, Result}; -use crate::schema::types::ColumnDescPtr; use crate::util::{ bit_util::{self, num_required_bits, BitWriter}, memory::ByteBufferPtr, @@ -76,12 +75,9 @@ pub trait Encoder { /// Gets a encoder for the particular data type `T` and encoding `encoding`. Memory usage /// for the encoder instance is tracked by `mem_tracker`. -pub fn get_encoder( - desc: ColumnDescPtr, - encoding: Encoding, -) -> Result>> { +pub fn get_encoder(encoding: Encoding) -> Result>> { let encoder: Box> = match encoding { - Encoding::PLAIN => Box::new(PlainEncoder::new(desc, vec![])), + Encoding::PLAIN => Box::new(PlainEncoder::new()), Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => { return Err(general_err!( "Cannot initialize this encoding through this function" @@ -113,17 +109,21 @@ pub fn get_encoder( pub struct PlainEncoder { buffer: Vec, bit_writer: BitWriter, - desc: ColumnDescPtr, _phantom: PhantomData, } +impl Default for PlainEncoder { + fn default() -> Self { + Self::new() + } +} + impl PlainEncoder { /// Creates new plain encoder. - pub fn new(desc: ColumnDescPtr, buffer: Vec) -> Self { + pub fn new() -> Self { Self { - buffer, + buffer: vec![], bit_writer: BitWriter::new(256), - desc, _phantom: PhantomData, } } @@ -171,6 +171,12 @@ pub struct RleValueEncoder { _phantom: PhantomData, } +impl Default for RleValueEncoder { + fn default() -> Self { + Self::new() + } +} + impl RleValueEncoder { /// Creates new rle value encoder. pub fn new() -> Self { @@ -241,7 +247,6 @@ impl Encoder for RleValueEncoder { const MAX_PAGE_HEADER_WRITER_SIZE: usize = 32; const MAX_BIT_WRITER_SIZE: usize = 10 * 1024 * 1024; -const DEFAULT_BLOCK_SIZE: usize = 128; const DEFAULT_NUM_MINI_BLOCKS: usize = 4; /// Delta bit packed encoder. @@ -281,15 +286,28 @@ pub struct DeltaBitPackEncoder { _phantom: PhantomData, } +impl Default for DeltaBitPackEncoder { + fn default() -> Self { + Self::new() + } +} + impl DeltaBitPackEncoder { /// Creates new delta bit packed encoder. 
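Note: with the column descriptor parameter dropped, get_encoder above is keyed only by the value type and the encoding. The plain-text diff does not show the generic arguments; assuming the call shape get_encoder::<T>(encoding) -> Result<Box<dyn Encoder<T>>>, a test-style usage sketch looks roughly like:

    // sketch only; Int32Type, Encoding, get_encoder and ParquetError are crate items
    fn roundtrip_sketch() -> Result<(), ParquetError> {
        let mut encoder = get_encoder::<Int32Type>(Encoding::DELTA_BINARY_PACKED)?;
        encoder.put(&[1, 2, 3])?;
        let _bytes = encoder.flush_buffer()?;
        Ok(())
    }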
pub fn new() -> Self { - let block_size = DEFAULT_BLOCK_SIZE; - let num_mini_blocks = DEFAULT_NUM_MINI_BLOCKS; - let mini_block_size = block_size / num_mini_blocks; - assert_eq!(mini_block_size % 8, 0); Self::assert_supported_type(); + // Size miniblocks so that they can be efficiently decoded + let mini_block_size = match T::T::PHYSICAL_TYPE { + Type::INT32 => 32, + Type::INT64 => 64, + _ => unreachable!(), + }; + + let num_mini_blocks = DEFAULT_NUM_MINI_BLOCKS; + let block_size = mini_block_size * num_mini_blocks; + assert_eq!(block_size % 128, 0); + DeltaBitPackEncoder { page_header_writer: BitWriter::new(MAX_PAGE_HEADER_WRITER_SIZE), bit_writer: BitWriter::new(MAX_BIT_WRITER_SIZE), @@ -525,6 +543,12 @@ pub struct DeltaLengthByteArrayEncoder { _phantom: PhantomData, } +impl Default for DeltaLengthByteArrayEncoder { + fn default() -> Self { + Self::new() + } +} + impl DeltaLengthByteArrayEncoder { /// Creates new delta length byte array encoder. pub fn new() -> Self { @@ -604,6 +628,12 @@ pub struct DeltaByteArrayEncoder { _phantom: PhantomData, } +impl Default for DeltaByteArrayEncoder { + fn default() -> Self { + Self::new() + } +} + impl DeltaByteArrayEncoder { /// Creates new delta byte array encoder. pub fn new() -> Self { @@ -699,7 +729,7 @@ mod tests { use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, }; - use crate::util::test_common::{random_bytes, RandGen}; + use crate::util::test_common::rand_gen::{random_bytes, RandGen}; const TEST_SET_SIZE: usize = 1024; @@ -841,7 +871,7 @@ mod tests { Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => { Box::new(create_test_dict_encoder::(type_length)) } - _ => create_test_encoder::(type_length, encoding), + _ => create_test_encoder::(encoding), }; assert_eq!(encoder.estimated_data_encoded_size(), initial_size); @@ -894,7 +924,7 @@ mod tests { #[test] fn test_issue_47() { let mut encoder = - create_test_encoder::(0, Encoding::DELTA_BYTE_ARRAY); + create_test_encoder::(Encoding::DELTA_BYTE_ARRAY); let mut decoder = create_test_decoder::(0, Encoding::DELTA_BYTE_ARRAY); @@ -946,7 +976,7 @@ mod tests { impl> EncodingTester for T { fn test_internal(enc: Encoding, total: usize, type_length: i32) -> Result<()> { - let mut encoder = create_test_encoder::(type_length, enc); + let mut encoder = create_test_encoder::(enc); let mut decoder = create_test_decoder::(type_length, enc); let mut values = >::gen_vec(type_length, total); let mut result_data = vec![T::T::default(); total]; @@ -1048,8 +1078,7 @@ mod tests { encoding: Encoding, err: Option, ) { - let descr = create_test_col_desc_ptr(-1, T::get_physical_type()); - let encoder = get_encoder::(descr, encoding); + let encoder = get_encoder::(encoding); match err { Some(parquet_error) => { assert!(encoder.is_err()); @@ -1076,12 +1105,8 @@ mod tests { )) } - fn create_test_encoder( - type_len: i32, - enc: Encoding, - ) -> Box> { - let desc = create_test_col_desc_ptr(type_len, T::get_physical_type()); - get_encoder(desc, enc).unwrap() + fn create_test_encoder(enc: Encoding) -> Box> { + get_encoder(enc).unwrap() } fn create_test_decoder( diff --git a/parquet/src/encodings/levels.rs b/parquet/src/encodings/levels.rs index 62c68d843c71..95384926ddba 100644 --- a/parquet/src/encodings/levels.rs +++ b/parquet/src/encodings/levels.rs @@ -23,7 +23,7 @@ use crate::basic::Encoding; use crate::data_type::AsBytes; use crate::errors::Result; use crate::util::{ - bit_util::{ceil, num_required_bits, BitReader, BitWriter}, + bit_util::{ceil, num_required_bits, 
read_num_bytes, BitReader, BitWriter}, memory::ByteBufferPtr, }; @@ -142,12 +142,14 @@ impl LevelEncoder { /// Decoder for definition/repetition levels. /// Currently only supports RLE and BIT_PACKED encoding for Data Page v1 and /// RLE for Data Page v2. +#[allow(unused)] pub enum LevelDecoder { Rle(Option, RleDecoder), RleV2(Option, RleDecoder), BitPacked(Option, u8, BitReader), } +#[allow(unused)] impl LevelDecoder { /// Creates new level decoder based on encoding and max definition/repetition level. /// This method only initializes level decoder, `set_data` method must be called @@ -190,7 +192,7 @@ impl LevelDecoder { LevelDecoder::Rle(ref mut num_values, ref mut decoder) => { *num_values = Some(num_buffered_values); let i32_size = mem::size_of::(); - let data_size = read_num_bytes!(i32, i32_size, data.as_ref()) as usize; + let data_size = read_num_bytes::(i32_size, data.as_ref()) as usize; decoder.set_data(data.range(i32_size, data_size)); i32_size + data_size } @@ -274,7 +276,7 @@ impl LevelDecoder { mod tests { use super::*; - use crate::util::test_common::random_numbers_range; + use crate::util::test_common::rand_gen::random_numbers_range; fn test_internal_roundtrip(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { let mut encoder = if v2 { diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index aad833e0eee3..39a0aa4d03da 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -45,7 +45,6 @@ use crate::util::{ /// Maximum groups per bit-packed run. Current value is 64. const MAX_GROUPS_PER_BIT_PACKED_RUN: usize = 1 << 6; const MAX_VALUES_PER_BIT_PACKED_RUN: usize = MAX_GROUPS_PER_BIT_PACKED_RUN * 8; -const MAX_WRITER_BUF_SIZE: usize = 1 << 10; /// A RLE/Bit-Packing hybrid encoder. // TODO: tracking memory usage @@ -56,9 +55,6 @@ pub struct RleEncoder { // Underlying writer which holds an internal buffer. bit_writer: BitWriter, - // The maximum byte size a single run can take. - max_run_byte_size: usize, - // Buffered values for bit-packed runs. buffered_values: [u64; 8], @@ -82,6 +78,7 @@ pub struct RleEncoder { } impl RleEncoder { + #[allow(unused)] pub fn new(bit_width: u8, buffer_len: usize) -> Self { let buffer = Vec::with_capacity(buffer_len); RleEncoder::new_from_buf(bit_width, buffer) @@ -89,12 +86,10 @@ impl RleEncoder { /// Initialize the encoder from existing `buffer` pub fn new_from_buf(bit_width: u8, buffer: Vec) -> Self { - let max_run_byte_size = RleEncoder::min_buffer_size(bit_width); let bit_writer = BitWriter::new_from_buf(buffer); RleEncoder { bit_width, bit_writer, - max_run_byte_size, buffered_values: [0; 8], num_buffered_values: 0, current_value: 0, @@ -162,6 +157,7 @@ impl RleEncoder { } #[inline] + #[allow(unused)] pub fn buffer(&self) -> &[u8] { self.bit_writer.buffer() } @@ -171,6 +167,7 @@ impl RleEncoder { self.bit_writer.bytes_written() } + #[allow(unused)] pub fn is_empty(&self) -> bool { self.bit_writer.bytes_written() == 0 } @@ -184,6 +181,7 @@ impl RleEncoder { /// Borrow equivalent of the `consume` method. /// Call `clear()` after invoking this method. #[inline] + #[allow(unused)] pub fn flush_buffer(&mut self) -> &[u8] { self.flush(); self.bit_writer.flush_buffer() @@ -192,6 +190,7 @@ impl RleEncoder { /// Clears the internal state so this encoder can be reused (e.g., after becoming /// full). 
#[inline] + #[allow(unused)] pub fn clear(&mut self) { self.bit_writer.clear(); self.num_buffered_values = 0; diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index c2fb5bd66cf9..c4f5faaaacae 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -22,7 +22,7 @@ use std::{cell, io, result, str}; #[cfg(any(feature = "arrow", test))] use arrow::error::ArrowError; -#[derive(Debug, PartialEq, Clone)] +#[derive(Debug, PartialEq, Clone, Eq)] pub enum ParquetError { /// General Parquet error. /// Returned when code violates normal workflow of working with Parquet files. @@ -148,8 +148,8 @@ macro_rules! arrow_err { // Convert parquet error into other errors #[cfg(any(feature = "arrow", test))] -impl Into for ParquetError { - fn into(self) -> ArrowError { - ArrowError::ParquetError(format!("{}", self)) +impl From for ArrowError { + fn from(p: ParquetError) -> Self { + Self::ParquetError(format!("{}", p)) } } diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 58eaf7a8c875..018dd95d9f35 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -834,6 +834,12 @@ pub struct ColumnIndexBuilder { valid: bool, } +impl Default for ColumnIndexBuilder { + fn default() -> Self { + Self::new() + } +} + impl ColumnIndexBuilder { pub fn new() -> Self { ColumnIndexBuilder { @@ -887,6 +893,12 @@ pub struct OffsetIndexBuilder { current_first_row_index: i64, } +impl Default for OffsetIndexBuilder { + fn default() -> Self { + Self::new() + } +} + impl OffsetIndexBuilder { pub fn new() -> Self { OffsetIndexBuilder { diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index 3180c7820802..e499a094ae00 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -21,7 +21,7 @@ use parquet_format::{ }; /// PageEncodingStats for a column chunk and data page. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct PageEncodingStats { /// the page type (data/dic/...) pub page_type: PageType, diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 45381234c027..f29b80accae2 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -47,6 +47,7 @@ impl PageIndex { } #[derive(Debug, Clone, PartialEq)] +#[allow(non_camel_case_types)] pub enum Index { /// Sometimes reading page index from parquet file /// will only return pageLocations without min_max index, diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs index fc87ef20448f..bb7808f16487 100644 --- a/parquet/src/file/page_index/mod.rs +++ b/parquet/src/file/page_index/mod.rs @@ -17,4 +17,6 @@ pub mod index; pub mod index_reader; + +#[cfg(test)] pub(crate) mod range; diff --git a/parquet/src/file/page_index/range.rs b/parquet/src/file/page_index/range.rs index 06c06553ccd5..e9741ec8e7fd 100644 --- a/parquet/src/file/page_index/range.rs +++ b/parquet/src/file/page_index/range.rs @@ -213,6 +213,7 @@ impl RowRanges { result } + #[allow(unused)] pub fn row_count(&self) -> usize { self.ranges.iter().map(|x| x.count()).sum() } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 9ca7c4daa597..57dae323d892 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -68,7 +68,8 @@ const DEFAULT_CREATED_BY: &str = env!("PARQUET_CREATED_BY"); /// Parquet writer version. /// /// Basic constant, which is not part of the Thrift definition. 
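Note: swapping the Into impl for From in errors.rs above follows the standard-library guidance: implementing From gives the matching Into for free and lets the ? operator perform the conversion. A toy illustration with stand-in error types (names are illustrative only):

    #[derive(Debug)]
    struct ParquetishError(String);
    #[derive(Debug)]
    struct ArrowishError(String);

    impl From<ParquetishError> for ArrowishError {
        fn from(e: ParquetishError) -> Self {
            ArrowishError(format!("{:?}", e))
        }
    }

    fn decode_page() -> Result<(), ParquetishError> {
        Err(ParquetishError("bad page header".into()))
    }

    fn read_batch() -> Result<(), ArrowishError> {
        decode_page()?; // `?` applies the From impl automatically
        Ok(())
    }

    fn main() {
        assert!(read_batch().is_err());
    }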
-#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum WriterVersion { PARQUET_1_0, PARQUET_2_0, @@ -360,7 +361,7 @@ impl WriterPropertiesBuilder { fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties { self.column_properties .entry(col) - .or_insert(ColumnProperties::new()) + .or_insert_with(ColumnProperties::new) } /// Sets encoding for a column. diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 766813f11aee..0b7451f4bea7 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -39,8 +39,7 @@ use crate::util::{io::TryClone, memory::ByteBufferPtr}; // export `SliceableCursor` and `FileSource` publically so clients can // re-use the logic in their own ParquetFileWriter wrappers -#[allow(deprecated)] -pub use crate::util::{cursor::SliceableCursor, io::FileSource}; +pub use crate::util::io::FileSource; // ---------------------------------------------------------------------- // Implementations of traits facilitating the creation of a new reader @@ -86,22 +85,6 @@ impl ChunkReader for Bytes { } } -#[allow(deprecated)] -impl Length for SliceableCursor { - fn len(&self) -> u64 { - SliceableCursor::len(self) - } -} - -#[allow(deprecated)] -impl ChunkReader for SliceableCursor { - type T = SliceableCursor; - - fn get_read(&self, start: u64, length: usize) -> Result { - self.slice(start, length).map_err(|e| e.into()) - } -} - impl TryFrom for SerializedFileReader { type Error = ParquetError; @@ -155,29 +138,29 @@ pub struct SerializedFileReader { metadata: ParquetMetaData, } +/// A predicate for filtering row groups, invoked with the metadata and index +/// of each row group in the file. Only row groups for which the predicate +/// evaluates to `true` will be scanned +pub type ReadGroupPredicate = Box bool>; + /// A builder for [`ReadOptions`]. /// For the predicates that are added to the builder, /// they will be chained using 'AND' to filter the row groups. +#[derive(Default)] pub struct ReadOptionsBuilder { - predicates: Vec bool>>, + predicates: Vec, enable_page_index: bool, } impl ReadOptionsBuilder { /// New builder pub fn new() -> Self { - ReadOptionsBuilder { - predicates: vec![], - enable_page_index: false, - } + Self::default() } /// Add a predicate on row group metadata to the reading option, /// Filter only row groups that match the predicate criteria - pub fn with_predicate( - mut self, - predicate: Box bool>, - ) -> Self { + pub fn with_predicate(mut self, predicate: ReadGroupPredicate) -> Self { self.predicates.push(predicate); self } @@ -214,7 +197,7 @@ impl ReadOptionsBuilder { /// Currently, only predicates on row group metadata are supported. /// All predicates will be chained using 'AND' to filter the row groups. 
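Note: ReadGroupPredicate above names the boxed closure that with_predicate accepts; the generic arguments are stripped in this plain-text diff, but the alias is presumably Box<dyn FnMut(&RowGroupMetaData, usize) -> bool>. A hedged usage sketch (the builder's build() and the new_with_options reader constructor are assumed here, not shown in this hunk):

    // skip empty row groups while scanning
    fn scan_options() -> ReadOptions {
        ReadOptionsBuilder::new()
            .with_predicate(Box::new(|row_group: &RowGroupMetaData, _index: usize| {
                row_group.num_rows() > 0
            }))
            .build()
    }
    // then, presumably: SerializedFileReader::new_with_options(file, scan_options())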
pub struct ReadOptions { - predicates: Vec bool>>, + predicates: Vec, enable_page_index: bool, } @@ -709,7 +692,7 @@ mod tests { use crate::record::RowAccessor; use crate::schema::parser::parse_message_type; use crate::util::bit_util::from_le_slice; - use crate::util::test_common::{get_test_file, get_test_path}; + use crate::util::test_common::file_util::{get_test_file, get_test_path}; use parquet_format::BoundaryOrder; use std::sync::Arc; @@ -1512,7 +1495,9 @@ mod tests { if i != 351 { assert!((meta.num_rows == 21) || (meta.num_rows == 20)); } else { - assert_eq!(meta.num_rows, 11); + // last page first row index is 7290, total row count is 7300 + // because first row start with zero, last page row count should be 10. + assert_eq!(meta.num_rows, 10); } assert!(!meta.is_dict); vec.push(meta); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 467273aaab9d..863ccf85468d 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -40,7 +40,6 @@ use crate::file::{ use crate::schema::types::{ self, ColumnDescPtr, SchemaDescPtr, SchemaDescriptor, TypePtr, }; -use crate::util::io::TryClone; /// A wrapper around a [`Write`] that keeps track of the number /// of bytes that have been written @@ -109,11 +108,6 @@ pub type OnCloseRowGroup<'a> = Box< + 'a, >; -#[deprecated = "use std::io::Write"] -pub trait ParquetWriter: Write + std::io::Seek + TryClone {} -#[allow(deprecated)] -impl ParquetWriter for T {} - // ---------------------------------------------------------------------- // Serialized impl for file & row group writers diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index d4eaaf41686a..90fe399e78d7 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -19,6 +19,9 @@ //! [Apache Parquet](https://parquet.apache.org/), part of //! the [Apache Arrow](https://arrow.apache.org/) project. //! +//! Please see the [parquet crates.io](https://crates.io/crates/parquet) +//! page for feature flags and tips to improve performance. +//! //! # Getting Started //! Start with some examples: //! @@ -30,14 +33,6 @@ //! //! 3. [arrow::async_reader] for `async` reading and writing parquet //! files to Arrow `RecordBatch`es (requires the `async` feature). -#![allow(dead_code)] -#![allow(non_camel_case_types)] -#![allow( - clippy::from_over_into, - clippy::new_without_default, - clippy::or_fun_call, - clippy::too_many_arguments -)] /// Defines a an item with an experimental public API /// diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index 05b63661f09b..0b7e04587354 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -40,6 +40,12 @@ pub struct TreeBuilder { batch_size: usize, } +impl Default for TreeBuilder { + fn default() -> Self { + Self::new() + } +} + impl TreeBuilder { /// Creates new tree builder with default parameters. pub fn new() -> Self { @@ -822,7 +828,7 @@ mod tests { use crate::file::reader::{FileReader, SerializedFileReader}; use crate::record::api::{Field, Row, RowAccessor, RowFormatter}; use crate::schema::parser::parse_message_type; - use crate::util::test_common::{get_test_file, get_test_path}; + use crate::util::test_common::file_util::{get_test_file, get_test_path}; use std::convert::TryFrom; // Convenient macros to assemble row, list, map, and group. 
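Note: the deprecated ParquetWriter trait removed above required Write + Seek + TryClone; the serialized writer now only needs std::io::Write. A hedged sketch of what that buys, assuming the SerializedFileWriter::new(writer, schema, props) constructor and close() method, which are not shown in this diff:

    // Any Write sink works; no Seek or TryClone bound, so a plain Vec<u8> is enough.
    // TypePtr, WriterPropertiesPtr, SerializedFileWriter and ParquetError are crate items.
    fn write_to_memory(schema: TypePtr, props: WriterPropertiesPtr) -> Result<Vec<u8>, ParquetError> {
        let mut buffer = Vec::new();
        {
            let mut writer = SerializedFileWriter::new(&mut buffer, schema, props)?;
            // ... append row groups via writer.next_row_group() ... (elided)
            writer.close()?;
        }
        Ok(buffer)
    }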
diff --git a/parquet/src/record/triplet.rs b/parquet/src/record/triplet.rs index de566a122e20..5a7e2a0ca74e 100644 --- a/parquet/src/record/triplet.rs +++ b/parquet/src/record/triplet.rs @@ -363,7 +363,7 @@ mod tests { use crate::file::reader::{FileReader, SerializedFileReader}; use crate::schema::types::ColumnPath; - use crate::util::test_common::get_test_file; + use crate::util::test_common::file_util::get_test_file; #[test] #[should_panic(expected = "Expected positive batch size, found: 0")] diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 8d624fe3d185..823803167ca1 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -593,7 +593,7 @@ impl<'a> GroupTypeBuilder<'a> { /// Basic type info. This contains information such as the name of the type, /// the repetition level, the logical type and the kind of the type (group, primitive). -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct BasicTypeInfo { name: String, repetition: Option, diff --git a/parquet/src/util/bit_pack.rs b/parquet/src/util/bit_pack.rs index b268aa567632..8cea20de2539 100644 --- a/parquet/src/util/bit_pack.rs +++ b/parquet/src/util/bit_pack.rs @@ -97,7 +97,6 @@ unpack!(unpack64, u64, 8, 64); #[cfg(test)] mod tests { use super::*; - use rand::{thread_rng, Rng}; #[test] fn test_basic() { diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index 1dec9b03f082..5d76a8dbf47d 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -88,49 +88,17 @@ impl FromBytes for bool { from_le_bytes! { u8, u16, u32, u64, i8, i16, i32, i64, f32, f64 } -/// Reads `$size` of bytes from `$src`, and reinterprets them as type `$ty`, in -/// little-endian order. `$ty` must implement the `Default` trait. Otherwise this won't -/// compile. +/// Reads `size` of bytes from `src`, and reinterprets them as type `ty`, in +/// little-endian order. /// This is copied and modified from byteorder crate. -macro_rules! read_num_bytes { - ($ty:ty, $size:expr, $src:expr) => {{ - assert!($size <= $src.len()); - let mut buffer = <$ty as $crate::util::bit_util::FromBytes>::Buffer::default(); - buffer.as_mut()[..$size].copy_from_slice(&$src[..$size]); - <$ty>::from_ne_bytes(buffer) - }}; -} - -/// Converts value `val` of type `T` to a byte vector, by reading `num_bytes` from `val`. -/// NOTE: if `val` is less than the size of `T` then it can be truncated. -#[inline] -pub fn convert_to_bytes(val: &T, num_bytes: usize) -> Vec -where - T: ?Sized + AsBytes, -{ - let mut bytes: Vec = vec![0; num_bytes]; - memcpy_value(val.as_bytes(), num_bytes, &mut bytes); - bytes -} - -#[inline] -pub fn memcpy(source: &[u8], target: &mut [u8]) { - assert!(target.len() >= source.len()); - target[..source.len()].copy_from_slice(source) -} - -#[inline] -pub fn memcpy_value(source: &T, num_bytes: usize, target: &mut [u8]) +pub(crate) fn read_num_bytes(size: usize, src: &[u8]) -> T where - T: ?Sized + AsBytes, + T: FromBytes, { - assert!( - target.len() >= num_bytes, - "Not enough space. Only had {} bytes but need to put {} bytes", - target.len(), - num_bytes - ); - memcpy(&source.as_bytes()[..num_bytes], target) + assert!(size <= src.len()); + let mut buffer = ::Buffer::default(); + buffer.as_mut()[..size].copy_from_slice(&src[..size]); + ::from_ne_bytes(buffer) } /// Returns the ceil of value/divisor. 
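Note: the read_num_bytes! macro becomes a pub(crate) generic function in the bit_util.rs hunk above; the generic parameter is stripped in this plain-text rendering, but the signature is presumably fn read_num_bytes<T: FromBytes>(size: usize, src: &[u8]) -> T. The same idea for one concrete type, as a self-contained sketch:

    /// Read `size` bytes from the front of `src` as a little-endian u32,
    /// zero-extending when fewer than 4 bytes are requested.
    fn read_u32_prefix(size: usize, src: &[u8]) -> u32 {
        assert!(size <= 4 && size <= src.len());
        let mut buf = [0u8; 4];
        buf[..size].copy_from_slice(&src[..size]);
        u32::from_le_bytes(buf)
    }

    fn main() {
        assert_eq!(read_u32_prefix(2, &[0x34, 0x12, 0xff]), 0x1234);
    }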
@@ -152,16 +120,6 @@ pub fn trailing_bits(v: u64, num_bits: usize) -> u64 { } } -#[inline] -pub fn set_array_bit(bits: &mut [u8], i: usize) { - bits[i / 8] |= 1 << (i % 8); -} - -#[inline] -pub fn unset_array_bit(bits: &mut [u8], i: usize) { - bits[i / 8] &= !(1 << (i % 8)); -} - /// Returns the minimum number of bits needed to represent the value 'x' #[inline] pub fn num_required_bits(x: u64) -> u8 { @@ -383,7 +341,7 @@ impl BitReader { pub fn new(buffer: ByteBufferPtr) -> Self { let total_bytes = buffer.len(); let num_bytes = cmp::min(8, total_bytes); - let buffered_values = read_num_bytes!(u64, num_bytes, buffer.as_ref()); + let buffered_values = read_num_bytes::(num_bytes, buffer.as_ref()); BitReader { buffer, buffered_values, @@ -397,7 +355,7 @@ impl BitReader { self.buffer = buffer; self.total_bytes = self.buffer.len(); let num_bytes = cmp::min(8, self.total_bytes); - self.buffered_values = read_num_bytes!(u64, num_bytes, self.buffer.as_ref()); + self.buffered_values = read_num_bytes::(num_bytes, self.buffer.as_ref()); self.byte_offset = 0; self.bit_offset = 0; } @@ -666,7 +624,7 @@ impl BitReader { // Advance byte_offset to next unread byte and read num_bytes self.byte_offset += bytes_read; - let v = read_num_bytes!(T, num_bytes, self.buffer.data()[self.byte_offset..]); + let v = read_num_bytes::(num_bytes, &self.buffer.data()[self.byte_offset..]); self.byte_offset += num_bytes; // Reset buffered_values @@ -717,7 +675,7 @@ impl BitReader { fn reload_buffer_values(&mut self) { let bytes_to_read = cmp::min(self.total_bytes - self.byte_offset, 8); self.buffered_values = - read_num_bytes!(u64, bytes_to_read, self.buffer.data()[self.byte_offset..]); + read_num_bytes::(bytes_to_read, &self.buffer.data()[self.byte_offset..]); } } @@ -728,20 +686,11 @@ impl From> for BitReader { } } -/// Returns the nearest multiple of `factor` that is `>=` than `num`. Here `factor` must -/// be a power of 2. -/// -/// Copied from the arrow crate to make arrow optional -pub fn round_upto_power_of_2(num: usize, factor: usize) -> usize { - debug_assert!(factor > 0 && (factor & (factor - 1)) == 0); - (num + (factor - 1)) & !(factor - 1) -} - #[cfg(test)] mod tests { - use super::super::test_common::*; use super::*; + use crate::util::test_common::rand_gen::random_numbers; use rand::distributions::{Distribution, Standard}; use std::fmt::Debug; @@ -874,25 +823,6 @@ mod tests { assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(-2)); } - #[test] - fn test_set_array_bit() { - let mut buffer = vec![0, 0, 0]; - set_array_bit(&mut buffer[..], 1); - assert_eq!(buffer, vec![2, 0, 0]); - set_array_bit(&mut buffer[..], 4); - assert_eq!(buffer, vec![18, 0, 0]); - unset_array_bit(&mut buffer[..], 1); - assert_eq!(buffer, vec![16, 0, 0]); - set_array_bit(&mut buffer[..], 10); - assert_eq!(buffer, vec![16, 4, 0]); - set_array_bit(&mut buffer[..], 10); - assert_eq!(buffer, vec![16, 4, 0]); - set_array_bit(&mut buffer[..], 11); - assert_eq!(buffer, vec![16, 12, 0]); - unset_array_bit(&mut buffer[..], 10); - assert_eq!(buffer, vec![16, 8, 0]); - } - #[test] fn test_num_required_bits() { assert_eq!(num_required_bits(0), 0); diff --git a/parquet/src/util/cursor.rs b/parquet/src/util/cursor.rs deleted file mode 100644 index 706724dbf52a..000000000000 --- a/parquet/src/util/cursor.rs +++ /dev/null @@ -1,284 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::util::io::TryClone; -use std::io::{self, Cursor, Error, ErrorKind, Read, Seek, SeekFrom, Write}; -use std::sync::{Arc, Mutex}; -use std::{cmp, fmt}; - -/// This is object to use if your file is already in memory. -/// The sliceable cursor is similar to std::io::Cursor, except that it makes it easy to create "cursor slices". -/// To achieve this, it uses Arc instead of shared references. Indeed reference fields are painful -/// because the lack of Generic Associated Type implies that you would require complex lifetime propagation when -/// returning such a cursor. -#[allow(clippy::rc_buffer)] -#[deprecated = "use bytes::Bytes instead"] -pub struct SliceableCursor { - inner: Arc>, - start: u64, - length: usize, - pos: u64, -} - -#[allow(deprecated)] -impl fmt::Debug for SliceableCursor { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SliceableCursor") - .field("start", &self.start) - .field("length", &self.length) - .field("pos", &self.pos) - .field("inner.len", &self.inner.len()) - .finish() - } -} - -#[allow(deprecated)] -impl SliceableCursor { - pub fn new(content: impl Into>>) -> Self { - let inner = content.into(); - let size = inner.len(); - SliceableCursor { - inner, - start: 0, - pos: 0, - length: size, - } - } - - /// Create a slice cursor using the same data as a current one. 
- pub fn slice(&self, start: u64, length: usize) -> io::Result { - let new_start = self.start + start; - if new_start >= self.inner.len() as u64 - || new_start as usize + length > self.inner.len() - { - return Err(Error::new(ErrorKind::InvalidInput, "out of bound")); - } - Ok(SliceableCursor { - inner: Arc::clone(&self.inner), - start: new_start, - pos: new_start, - length, - }) - } - - fn remaining_slice(&self) -> &[u8] { - let end = self.start as usize + self.length; - let offset = cmp::min(self.pos, end as u64) as usize; - &self.inner[offset..end] - } - - /// Get the length of the current cursor slice - pub fn len(&self) -> u64 { - self.length as u64 - } - - /// return true if the cursor is empty (self.len() == 0) - pub fn is_empty(&self) -> bool { - self.len() == 0 - } -} - -/// Implementation inspired by std::io::Cursor -#[allow(deprecated)] -impl Read for SliceableCursor { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let n = Read::read(&mut self.remaining_slice(), buf)?; - self.pos += n as u64; - Ok(n) - } -} - -#[allow(deprecated)] -impl Seek for SliceableCursor { - fn seek(&mut self, pos: SeekFrom) -> io::Result { - let new_pos = match pos { - SeekFrom::Start(pos) => pos as i64, - SeekFrom::End(pos) => self.inner.len() as i64 + pos as i64, - SeekFrom::Current(pos) => self.pos as i64 + pos as i64, - }; - - if new_pos < 0 { - Err(Error::new( - ErrorKind::InvalidInput, - format!( - "Request out of bounds: cur position {} + seek {:?} < 0: {}", - self.pos, pos, new_pos - ), - )) - } else if new_pos >= self.inner.len() as i64 { - Err(Error::new( - ErrorKind::InvalidInput, - format!( - "Request out of bounds: cur position {} + seek {:?} >= length {}: {}", - self.pos, - pos, - self.inner.len(), - new_pos - ), - )) - } else { - self.pos = new_pos as u64; - Ok(self.start) - } - } -} - -/// Use this type to write Parquet to memory rather than a file. -#[deprecated = "use Vec instead"] -#[derive(Debug, Default, Clone)] -pub struct InMemoryWriteableCursor { - buffer: Arc>>>, -} - -#[allow(deprecated)] -impl InMemoryWriteableCursor { - /// Consume this instance and return the underlying buffer as long as there are no other - /// references to this instance. 
- pub fn into_inner(self) -> Option> { - Arc::try_unwrap(self.buffer) - .ok() - .and_then(|mutex| mutex.into_inner().ok()) - .map(|cursor| cursor.into_inner()) - } - - /// Returns a clone of the underlying buffer - pub fn data(&self) -> Vec { - let inner = self.buffer.lock().unwrap(); - inner.get_ref().to_vec() - } - - /// Returns a length of the underlying buffer - pub fn len(&self) -> usize { - let inner = self.buffer.lock().unwrap(); - inner.get_ref().len() - } - - /// Returns true if the underlying buffer contains no elements - pub fn is_empty(&self) -> bool { - let inner = self.buffer.lock().unwrap(); - inner.get_ref().is_empty() - } -} - -#[allow(deprecated)] -impl TryClone for InMemoryWriteableCursor { - fn try_clone(&self) -> std::io::Result { - Ok(Self { - buffer: self.buffer.clone(), - }) - } -} - -#[allow(deprecated)] -impl Write for InMemoryWriteableCursor { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - let mut inner = self.buffer.lock().unwrap(); - inner.write(buf) - } - - fn flush(&mut self) -> std::io::Result<()> { - let mut inner = self.buffer.lock().unwrap(); - inner.flush() - } -} - -#[allow(deprecated)] -impl Seek for InMemoryWriteableCursor { - fn seek(&mut self, pos: SeekFrom) -> std::io::Result { - let mut inner = self.buffer.lock().unwrap(); - inner.seek(pos) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - /// Create a SliceableCursor of all u8 values in ascending order - #[allow(deprecated)] - fn get_u8_range() -> SliceableCursor { - let data: Vec = (0u8..=255).collect(); - SliceableCursor::new(data) - } - - /// Reads all the bytes in the slice and checks that it matches the u8 range from start to end_included - #[allow(deprecated)] - fn check_read_all(mut cursor: SliceableCursor, start: u8, end_included: u8) { - let mut target = vec![]; - let cursor_res = cursor.read_to_end(&mut target); - println!("{:?}", cursor_res); - assert!(cursor_res.is_ok(), "reading error"); - assert_eq!((end_included - start) as usize + 1, cursor_res.unwrap()); - assert_eq!((start..=end_included).collect::>(), target); - } - - #[test] - fn read_all_whole() { - let cursor = get_u8_range(); - check_read_all(cursor, 0, 255); - } - - #[test] - fn read_all_slice() { - let cursor = get_u8_range().slice(10, 10).expect("error while slicing"); - check_read_all(cursor, 10, 19); - } - - #[test] - fn seek_cursor_start() { - let mut cursor = get_u8_range(); - - cursor.seek(SeekFrom::Start(5)).unwrap(); - check_read_all(cursor, 5, 255); - } - - #[test] - fn seek_cursor_current() { - let mut cursor = get_u8_range(); - cursor.seek(SeekFrom::Start(10)).unwrap(); - cursor.seek(SeekFrom::Current(10)).unwrap(); - check_read_all(cursor, 20, 255); - } - - #[test] - fn seek_cursor_end() { - let mut cursor = get_u8_range(); - - cursor.seek(SeekFrom::End(-10)).unwrap(); - check_read_all(cursor, 246, 255); - } - - #[test] - fn seek_cursor_error_too_long() { - let mut cursor = get_u8_range(); - let res = cursor.seek(SeekFrom::Start(1000)); - let actual_error = res.expect_err("expected error").to_string(); - let expected_error = - "Request out of bounds: cur position 0 + seek Start(1000) >= length 256: 1000"; - assert_eq!(actual_error, expected_error); - } - - #[test] - fn seek_cursor_error_too_short() { - let mut cursor = get_u8_range(); - let res = cursor.seek(SeekFrom::End(-1000)); - let actual_error = res.expect_err("expected error").to_string(); - let expected_error = - "Request out of bounds: cur position 0 + seek End(-1000) < 0: -744"; - assert_eq!(actual_error, expected_error); - } -} diff 
diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs
index 319750dd1013..e638237e06c5 100644
--- a/parquet/src/util/interner.rs
+++ b/parquet/src/util/interner.rs
@@ -18,7 +18,6 @@
 use crate::data_type::AsBytes;
 use hashbrown::hash_map::RawEntryMut;
 use hashbrown::HashMap;
-use std::hash::Hash;
 
 const DEFAULT_DEDUP_CAPACITY: usize = 4096;
 
@@ -62,7 +61,7 @@ impl<S: Storage> Interner<S> {
     /// Intern the value, returning the interned key, and if this was a new value
     pub fn intern(&mut self, value: &S::Value) -> S::Key {
-        let hash = compute_hash(&self.state, value);
+        let hash = self.state.hash_one(value.as_bytes());
 
         let entry = self
             .dedup
@@ -76,7 +75,7 @@ impl<S: Storage> Interner<S> {
 
         *entry
             .insert_with_hasher(hash, key, (), |key| {
-                compute_hash(&self.state, self.storage.get(*key))
+                self.state.hash_one(self.storage.get(*key).as_bytes())
            })
            .0
     }
@@ -93,10 +92,3 @@ impl<S: Storage> Interner<S> {
         self.storage
     }
 }
-
-fn compute_hash<T: AsBytes + ?Sized>(state: &ahash::RandomState, value: &T) -> u64 {
-    use std::hash::{BuildHasher, Hasher};
-    let mut hasher = state.build_hasher();
-    value.as_bytes().hash(&mut hasher);
-    hasher.finish()
-}
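Editor's note: the interner change replaces a hand-rolled build-hasher/hash/finish sequence with a single `hash_one` call on the `RandomState`. The sketch below illustrates the same simplification using the standard library's `std::collections::hash_map::RandomState` (the diff itself uses `ahash::RandomState`, which offers an equivalent method); it assumes a Rust toolchain where `BuildHasher::hash_one` is stable.

```rust
use std::collections::hash_map::RandomState;
use std::hash::{BuildHasher, Hash, Hasher};

// The pattern removed by the diff: build a hasher, feed it, finish it.
fn hash_verbose<T: Hash>(state: &RandomState, value: &T) -> u64 {
    let mut hasher = state.build_hasher();
    value.hash(&mut hasher);
    hasher.finish()
}

// The one-liner the diff switches to.
fn hash_concise<T: Hash>(state: &RandomState, value: &T) -> u64 {
    state.hash_one(value)
}

fn main() {
    let state = RandomState::new();
    let bytes: &[u8] = b"parquet";
    // Both formulations produce the same hash for the same state and input.
    assert_eq!(hash_verbose(&state, &bytes), hash_concise(&state, &bytes));
}
```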
diff --git a/parquet/src/util/io.rs b/parquet/src/util/io.rs
index a7b5e73074c6..1fb92063e27c 100644
--- a/parquet/src/util/io.rs
+++ b/parquet/src/util/io.rs
@@ -18,8 +18,6 @@
 use std::{cell::RefCell, cmp, fmt, io::*};
 
 use crate::file::reader::Length;
-#[allow(deprecated)]
-use crate::file::writer::ParquetWriter;
 
 const DEFAULT_BUF_SIZE: usize = 8 * 1024;
 
@@ -156,51 +154,6 @@ impl<R: ParquetReader> Length for FileSource<R> {
     }
 }
 
-/// Struct that represents `File` output stream with position tracking.
-/// Used as a sink in file writer.
-#[deprecated = "use TrackedWrite instead"]
-#[allow(deprecated)]
-pub struct FileSink<W: ParquetWriter> {
-    buf: BufWriter<W>,
-    // This is not necessarily position in the underlying file,
-    // but rather current position in the sink.
-    pos: u64,
-}
-
-#[allow(deprecated)]
-impl<W: ParquetWriter> FileSink<W> {
-    /// Creates new file sink.
-    /// Position is set to whatever position file has.
-    pub fn new(buf: &W) -> Self {
-        let mut owned_buf = buf.try_clone().unwrap();
-        let pos = owned_buf.seek(SeekFrom::Current(0)).unwrap();
-        Self {
-            buf: BufWriter::new(owned_buf),
-            pos,
-        }
-    }
-}
-
-#[allow(deprecated)]
-impl<W: ParquetWriter> Write for FileSink<W> {
-    fn write(&mut self, buf: &[u8]) -> Result<usize> {
-        let num_bytes = self.buf.write(buf)?;
-        self.pos += num_bytes as u64;
-        Ok(num_bytes)
-    }
-
-    fn flush(&mut self) -> Result<()> {
-        self.buf.flush()
-    }
-}
-
-#[allow(deprecated)]
-impl<W: ParquetWriter> Position for FileSink<W> {
-    fn pos(&self) -> u64 {
-        self.pos
-    }
-}
-
 // Position implementation for Cursor to use in various tests.
 impl<'a> Position for Cursor<&'a mut Vec<u8>> {
     fn pos(&self) -> u64 {
@@ -214,7 +167,7 @@ mod tests {
 
     use std::iter;
 
-    use crate::util::test_common::get_test_file;
+    use crate::util::test_common::file_util::get_test_file;
 
     #[test]
     fn test_io_read_fully() {
@@ -277,30 +230,6 @@ mod tests {
         assert_eq!(buf, vec![b'P', b'A', b'R', b'1']);
     }
 
-    #[test]
-    #[allow(deprecated)]
-    fn test_io_write_with_pos() {
-        let mut file = tempfile::tempfile().unwrap();
-        file.write_all(&[b'a', b'b', b'c']).unwrap();
-
-        // Write into sink
-        let mut sink = FileSink::new(&file);
-        assert_eq!(sink.pos(), 3);
-
-        sink.write_all(&[b'd', b'e', b'f', b'g']).unwrap();
-        assert_eq!(sink.pos(), 7);
-
-        sink.flush().unwrap();
-        assert_eq!(sink.pos(), file.seek(SeekFrom::Current(0)).unwrap());
-
-        // Read data using file chunk
-        let mut res = vec![0u8; 7];
-        let mut chunk =
-            FileSource::new(&file, 0, file.metadata().unwrap().len() as usize);
-        chunk.read_exact(&mut res[..]).unwrap();
-        assert_eq!(res, vec![b'a', b'b', b'c', b'd', b'e', b'f', b'g']);
-    }
-
     #[test]
     fn test_io_large_read() {
         // Generate repeated 'abcdef' pattern and write it into a file
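Editor's note: the removed `FileSink` existed to track how many bytes had passed through it, and its deprecation message points at `TrackedWrite` as the replacement. As a generic illustration of the idea only (plain std, illustrative type name, not the parquet `TrackedWrite` API), a byte-counting `Write` wrapper looks like this:

```rust
use std::io::{self, Write};

/// Illustrative wrapper that remembers how many bytes were written through it.
struct CountingWriter<W: Write> {
    inner: W,
    bytes_written: u64,
}

impl<W: Write> CountingWriter<W> {
    fn new(inner: W) -> Self {
        Self { inner, bytes_written: 0 }
    }

    /// Current position in the sink, i.e. total bytes written so far.
    fn pos(&self) -> u64 {
        self.bytes_written
    }
}

impl<W: Write> Write for CountingWriter<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let n = self.inner.write(buf)?;
        self.bytes_written += n as u64;
        Ok(n)
    }

    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}

fn main() -> io::Result<()> {
    let mut w = CountingWriter::new(Vec::new());
    w.write_all(b"PAR1")?;
    assert_eq!(w.pos(), 4);
    Ok(())
}
```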
diff --git a/parquet/src/util/mod.rs b/parquet/src/util/mod.rs
index 8510b1c2c786..d8ad739dbdb4 100644
--- a/parquet/src/util/mod.rs
+++ b/parquet/src/util/mod.rs
@@ -20,7 +20,6 @@ pub mod memory;
 #[macro_use]
 pub mod bit_util;
 mod bit_pack;
-pub mod cursor;
 pub(crate) mod interner;
 pub(crate) mod page_util;
 #[cfg(any(test, feature = "test_common"))]
diff --git a/parquet/src/util/page_util.rs b/parquet/src/util/page_util.rs
index 5cdcf7535c63..7716b71167fb 100644
--- a/parquet/src/util/page_util.rs
+++ b/parquet/src/util/page_util.rs
@@ -25,7 +25,8 @@ use crate::file::reader::ChunkReader;
 /// Use column chunk's offset index to get the `page_num` page row count.
 pub(crate) fn calculate_row_count(indexes: &[PageLocation], page_num: usize, total_row_count: i64) -> Result<usize> {
     if page_num == indexes.len() - 1 {
-        Ok((total_row_count - indexes[page_num].first_row_index + 1) as usize)
+        // first_row_index starts at 0, so there is no need to add one here.
+        Ok((total_row_count - indexes[page_num].first_row_index) as usize)
     } else {
         Ok((indexes[page_num + 1].first_row_index - indexes[page_num].first_row_index) as usize)
     }
@@ -52,3 +53,44 @@ pub(crate) fn get_pages_readable_slices<T: ChunkReader>(col
     }
     Ok((page_readers, has_dictionary_page))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /**
+    parquet-tools meta ./test.parquet got:
+
+    file schema: test_schema
+    --------------------------------------------------------------------------------
+    leaf: REQUIRED INT64 R:0 D:
+
+    row group 1: RC:256 TS:2216 OFFSET:4
+    --------------------------------------------------------------------------------
+    leaf: INT64 UNCOMPRESSED DO:0 FPO:4 SZ:2216/2216/1.00 VC:256 ENC:PLAIN,RLE ST:[min: 0, max: 255, num_nulls not defined
+
+    parquet-tools column-index -c leaf ./test.parquet got:
+
+    offset index for column leaf:
+              offset   compressed size   first row index
+    page-0         4               554                 0
+    page-1       558               554                64
+    page-2      1112               554               128
+    page-3      1666               554               192
+
+    **/
+    #[test]
+    fn test_calculate_row_count() {
+        let total_row_count = 256;
+        let mut indexes = vec![];
+        indexes.push(PageLocation::new(4, 554, 0));
+        indexes.push(PageLocation::new(558, 554, 64));
+        indexes.push(PageLocation::new(1112, 554, 128));
+        indexes.push(PageLocation::new(1666, 554, 192));
+        for i in 0..4 {
+            // each page should have 64 rows
+            assert_eq!(64, calculate_row_count(indexes.as_slice(), i, total_row_count).unwrap());
+        }
+
+    }
+}
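Editor's note: to see why dropping the `+ 1` matters, here is the arithmetic from the test above written out as a small self-contained sketch. It uses a simplified stand-in for `PageLocation` that keeps only the first row index of each page.

```rust
/// Row count of page `page_num`, given each page's 0-based first row index.
fn row_count(first_row_indexes: &[i64], page_num: usize, total_row_count: i64) -> i64 {
    if page_num == first_row_indexes.len() - 1 {
        // Last page: rows from its first row index to the end of the chunk.
        // With 256 total rows and first_row_index 192 this is 64, not 65,
        // because first_row_index is 0-based.
        total_row_count - first_row_indexes[page_num]
    } else {
        // Interior page: distance to the next page's first row index.
        first_row_indexes[page_num + 1] - first_row_indexes[page_num]
    }
}

fn main() {
    let first_row_indexes = [0, 64, 128, 192];
    let counts: Vec<i64> = (0..first_row_indexes.len())
        .map(|i| row_count(&first_row_indexes, i, 256))
        .collect();
    // Every page in the test file holds exactly 64 rows.
    assert_eq!(counts, vec![64, 64, 64, 64]);
}
```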
diff --git a/parquet/src/util/test_common/mod.rs b/parquet/src/util/test_common/mod.rs
index f0beb16ca954..504219ecae19 100644
--- a/parquet/src/util/test_common/mod.rs
+++ b/parquet/src/util/test_common/mod.rs
@@ -15,17 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-pub mod file_util;
 pub mod page_util;
-pub mod rand_gen;
-
-pub use self::rand_gen::random_bools;
-pub use self::rand_gen::random_bytes;
-pub use self::rand_gen::random_numbers;
-pub use self::rand_gen::random_numbers_range;
-pub use self::rand_gen::RandGen;
-pub use self::file_util::get_test_file;
-pub use self::file_util::get_test_path;
+#[cfg(test)]
+pub mod file_util;
 
-pub use self::page_util::make_pages;
+#[cfg(test)]
+pub mod rand_gen;
\ No newline at end of file
diff --git a/parquet/src/util/test_common/page_util.rs b/parquet/src/util/test_common/page_util.rs
index bc197d00e00d..243fb6f8b897 100644
--- a/parquet/src/util/test_common/page_util.rs
+++ b/parquet/src/util/test_common/page_util.rs
@@ -19,14 +19,12 @@
 use crate::basic::Encoding;
 use crate::column::page::{Page, PageIterator};
 use crate::column::page::{PageMetadata, PageReader};
 use crate::data_type::DataType;
-use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder};
+use crate::encodings::encoding::{get_encoder, Encoder};
 use crate::encodings::levels::LevelEncoder;
 use crate::errors::Result;
 use crate::schema::types::{ColumnDescPtr, SchemaDescPtr};
 use crate::util::memory::ByteBufferPtr;
-use crate::util::test_common::random_numbers_range;
-use rand::distributions::uniform::SampleUniform;
-use std::collections::VecDeque;
+use std::iter::Peekable;
 use std::mem;
 
 pub trait DataPageBuilder {
@@ -44,7 +42,6 @@ pub trait DataPageBuilder {
 /// - consume()
 /// in order to populate and obtain a data page.
 pub struct DataPageBuilderImpl {
-    desc: ColumnDescPtr,
     encoding: Option<Encoding>,
     num_values: u32,
     buffer: Vec<u8>,
@@ -57,9 +54,8 @@ impl DataPageBuilderImpl {
     // `num_values` is the number of non-null values to put in the data page.
     // `datapage_v2` flag is used to indicate if the generated data page should use V2
     // format or not.
-    pub fn new(desc: ColumnDescPtr, num_values: u32, datapage_v2: bool) -> Self {
+    pub fn new(_desc: ColumnDescPtr, num_values: u32, datapage_v2: bool) -> Self {
         DataPageBuilderImpl {
-            desc,
             encoding: None,
             num_values,
             buffer: vec![],
@@ -111,8 +107,7 @@ impl DataPageBuilder for DataPageBuilderImpl {
         );
         self.encoding = Some(encoding);
         let mut encoder: Box<dyn Encoder<T>> =
-            get_encoder::<T>(self.desc.clone(), encoding)
-                .expect("get_encoder() should be OK");
+            get_encoder::<T>(encoding).expect("get_encoder() should be OK");
         encoder.put(values).expect("put() should be OK");
         let encoded_values = encoder
             .flush_buffer()
@@ -133,8 +128,8 @@ impl DataPageBuilder for DataPageBuilderImpl {
                 encoding: self.encoding.unwrap(),
                 num_nulls: 0, /* set to dummy value - don't need this when reading
                                * data page */
-                num_rows: self.num_values, /* also don't need this when reading
-                                            * data page */
+                num_rows: self.num_values, /* num_rows is only needed for skip_records; skipping REPEATED fields
+                                            * is not supported yet, so we can assume num_values == num_rows */
                 def_levels_byte_len: self.def_levels_byte_len,
                 rep_levels_byte_len: self.rep_levels_byte_len,
                 is_compressed: false,
@@ -155,13 +150,13 @@
 /// A utility page reader which stores pages in memory.
 pub struct InMemoryPageReader<P: Iterator<Item = Page>> {
-    page_iter: P,
+    page_iter: Peekable<P>,
 }
 
 impl<P: Iterator<Item = Page>> InMemoryPageReader<P> {
     pub fn new(pages: impl IntoIterator<Item = Page, IntoIter = P>) -> Self {
         Self {
-            page_iter: pages.into_iter(),
+            page_iter: pages.into_iter().peekable(),
         }
     }
 }
@@ -172,11 +167,29 @@ impl<P: Iterator<Item = Page> + Send> PageReader for InMemoryPageReader<P> {
     }
 
     fn peek_next_page(&mut self) -> Result<Option<PageMetadata>> {
-        unimplemented!()
+        if let Some(x) = self.page_iter.peek() {
+            match x {
+                Page::DataPage { num_values, .. } => Ok(Some(PageMetadata {
+                    num_rows: *num_values as usize,
+                    is_dict: false,
+                })),
+                Page::DataPageV2 { num_rows, .. } => Ok(Some(PageMetadata {
+                    num_rows: *num_rows as usize,
+                    is_dict: false,
+                })),
+                Page::DictionaryPage { .. } => Ok(Some(PageMetadata {
+                    num_rows: 0,
+                    is_dict: true,
+                })),
+            }
+        } else {
+            Ok(None)
+        }
     }
 
     fn skip_next_page(&mut self) -> Result<()> {
-        unimplemented!()
+        self.page_iter.next();
+        Ok(())
     }
 }
@@ -229,88 +242,3 @@ impl<I: Iterator<Item = Vec<Page>> + Send> PageIterator for InMemoryPageIterator<I>
         Ok(self.column_desc.clone())
     }
 }
-
-pub fn make_pages<T: DataType>(
-    desc: ColumnDescPtr,
-    encoding: Encoding,
-    num_pages: usize,
-    levels_per_page: usize,
-    min: T::T,
-    max: T::T,
-    def_levels: &mut Vec<i16>,
-    rep_levels: &mut Vec<i16>,
-    values: &mut Vec<T::T>,
-    pages: &mut VecDeque<Page>,
-    use_v2: bool,
-) where
-    T::T: PartialOrd + SampleUniform + Copy,
-{
-    let mut num_values = 0;
-    let max_def_level = desc.max_def_level();
-    let max_rep_level = desc.max_rep_level();
-
-    let mut dict_encoder = DictEncoder::<T>::new(desc.clone());
-
-    for i in 0..num_pages {
-        let mut num_values_cur_page = 0;
-        let level_range = i * levels_per_page..(i + 1) * levels_per_page;
-
-        if max_def_level > 0 {
-            random_numbers_range(levels_per_page, 0, max_def_level + 1, def_levels);
-            for dl in &def_levels[level_range.clone()] {
-                if *dl == max_def_level {
-                    num_values_cur_page += 1;
-                }
-            }
-        } else {
-            num_values_cur_page = levels_per_page;
-        }
-        if max_rep_level > 0 {
-            random_numbers_range(levels_per_page, 0, max_rep_level + 1, rep_levels);
-        }
-        random_numbers_range(num_values_cur_page, min, max, values);
-
-        // Generate the current page
-
-        let mut pb =
-            DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2);
-        if max_rep_level > 0 {
-            pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]);
-        }
-        if max_def_level > 0 {
-            pb.add_def_levels(max_def_level, &def_levels[level_range]);
-        }
-
-        let value_range = num_values..num_values + num_values_cur_page;
-        match encoding {
-            Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => {
-                let _ = dict_encoder.put(&values[value_range.clone()]);
-                let indices = dict_encoder
-                    .write_indices()
-                    .expect("write_indices() should be OK");
-                pb.add_indices(indices);
-            }
-            Encoding::PLAIN => {
-                pb.add_values::<T>(encoding, &values[value_range]);
-            }
-            enc => panic!("Unexpected encoding {}", enc),
-        }
-
-        let data_page = pb.consume();
-        pages.push_back(data_page);
-        num_values += num_values_cur_page;
-    }
-
-    if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY {
-        let dict = dict_encoder
-            .write_dict()
-            .expect("write_dict() should be OK");
-        let dict_page = Page::DictionaryPage {
-            buf: dict,
-            num_values: dict_encoder.num_entries() as u32,
-            encoding: Encoding::RLE_DICTIONARY,
-            is_sorted: false,
-        };
-        pages.push_front(dict_page);
-    }
-}
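Editor's note: wrapping the page iterator in `Peekable` is what makes `peek_next_page` possible without consuming a page, and makes `skip_next_page` a plain `next()`. A tiny standalone illustration of the same lookahead pattern:

```rust
fn main() {
    // Peekable wraps any iterator and caches the next item so it can be
    // inspected without being consumed - the same trick InMemoryPageReader
    // now uses for peek_next_page/skip_next_page.
    let pages = vec!["dictionary", "data-1", "data-2"];
    let mut iter = pages.into_iter().peekable();

    // Peeking does not advance the iterator...
    assert_eq!(iter.peek(), Some(&"dictionary"));
    assert_eq!(iter.peek(), Some(&"dictionary"));

    // ...while next() (the equivalent of skip_next_page) does.
    assert_eq!(iter.next(), Some("dictionary"));
    assert_eq!(iter.peek(), Some(&"data-1"));
}
```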
diff --git a/parquet/src/util/test_common/rand_gen.rs b/parquet/src/util/test_common/rand_gen.rs
index d9c256577684..4e54aa7999cf 100644
--- a/parquet/src/util/test_common/rand_gen.rs
+++ b/parquet/src/util/test_common/rand_gen.rs
@@ -15,13 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use crate::basic::Encoding;
+use crate::column::page::Page;
 use rand::{
     distributions::{uniform::SampleUniform, Distribution, Standard},
     thread_rng, Rng,
 };
+use std::collections::VecDeque;
 
 use crate::data_type::*;
+use crate::encodings::encoding::{DictEncoder, Encoder};
+use crate::schema::types::ColumnDescPtr;
 use crate::util::memory::ByteBufferPtr;
+use crate::util::{DataPageBuilder, DataPageBuilderImpl};
 
 /// Random generator of data type `T` values and sequences.
 pub trait RandGen<T: DataType> {
@@ -106,15 +112,6 @@ pub fn random_bytes(n: usize) -> Vec<u8> {
     result
 }
 
-pub fn random_bools(n: usize) -> Vec<bool> {
-    let mut result = vec![];
-    let mut rng = thread_rng();
-    for _ in 0..n {
-        result.push(rng.gen::<bool>());
-    }
-    result
-}
-
 pub fn random_numbers<T>(n: usize) -> Vec<T>
 where
     Standard: Distribution<T>,
@@ -132,3 +129,89 @@ where
         result.push(rng.gen_range(low..high));
     }
 }
+
+#[allow(clippy::too_many_arguments)]
+pub fn make_pages<T: DataType>(
+    desc: ColumnDescPtr,
+    encoding: Encoding,
+    num_pages: usize,
+    levels_per_page: usize,
+    min: T::T,
+    max: T::T,
+    def_levels: &mut Vec<i16>,
+    rep_levels: &mut Vec<i16>,
+    values: &mut Vec<T::T>,
+    pages: &mut VecDeque<Page>,
+    use_v2: bool,
+) where
+    T::T: PartialOrd + SampleUniform + Copy,
+{
+    let mut num_values = 0;
+    let max_def_level = desc.max_def_level();
+    let max_rep_level = desc.max_rep_level();
+
+    let mut dict_encoder = DictEncoder::<T>::new(desc.clone());
+
+    for i in 0..num_pages {
+        let mut num_values_cur_page = 0;
+        let level_range = i * levels_per_page..(i + 1) * levels_per_page;
+
+        if max_def_level > 0 {
+            random_numbers_range(levels_per_page, 0, max_def_level + 1, def_levels);
+            for dl in &def_levels[level_range.clone()] {
+                if *dl == max_def_level {
+                    num_values_cur_page += 1;
+                }
+            }
+        } else {
+            num_values_cur_page = levels_per_page;
+        }
+        if max_rep_level > 0 {
+            random_numbers_range(levels_per_page, 0, max_rep_level + 1, rep_levels);
+        }
+        random_numbers_range(num_values_cur_page, min, max, values);
+
+        // Generate the current page
+
+        let mut pb =
+            DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2);
+        if max_rep_level > 0 {
+            pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]);
+        }
+        if max_def_level > 0 {
+            pb.add_def_levels(max_def_level, &def_levels[level_range]);
+        }
+
+        let value_range = num_values..num_values + num_values_cur_page;
+        match encoding {
+            Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => {
+                let _ = dict_encoder.put(&values[value_range.clone()]);
+                let indices = dict_encoder
+                    .write_indices()
+                    .expect("write_indices() should be OK");
+                pb.add_indices(indices);
+            }
+            Encoding::PLAIN => {
+                pb.add_values::<T>(encoding, &values[value_range]);
+            }
+            enc => panic!("Unexpected encoding {}", enc),
+        }
+
+        let data_page = pb.consume();
+        pages.push_back(data_page);
+        num_values += num_values_cur_page;
+    }
+
+    if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY {
+        let dict = dict_encoder
+            .write_dict()
+            .expect("write_dict() should be OK");
+        let dict_page = Page::DictionaryPage {
+            buf: dict,
+            num_values: dict_encoder.num_entries() as u32,
+            encoding: Encoding::RLE_DICTIONARY,
+            is_sorted: false,
+        };
+        pages.push_front(dict_page);
+    }
+}
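Editor's note: `make_pages` appends data pages with `push_back` and, for dictionary encodings, prepends the dictionary page with `push_front`, so a reader pulling pages off the front always sees the dictionary before any data page that references it. A minimal illustration of that ordering property in plain std:

```rust
use std::collections::VecDeque;

fn main() {
    let mut pages: VecDeque<&str> = VecDeque::new();

    // Data pages are appended as they are generated...
    pages.push_back("data page 0");
    pages.push_back("data page 1");

    // ...and the dictionary page is prepended at the end, so it is the
    // first thing a reader pulls off the queue.
    pages.push_front("dictionary page");

    assert_eq!(pages.pop_front(), Some("dictionary page"));
    assert_eq!(pages.pop_front(), Some("data page 0"));
}
```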
diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml
index 16e19df57af6..3f586de6928a 100644
--- a/parquet_derive/Cargo.toml
+++ b/parquet_derive/Cargo.toml
@@ -17,7 +17,7 @@
 [package]
 name = "parquet_derive"
-version = "19.0.0"
+version = "20.0.0"
 license = "Apache-2.0"
 description = "Derive macros for the Rust implementation of Apache Parquet"
 homepage = "https://github.com/apache/arrow-rs"
@@ -35,4 +35,4 @@ proc-macro = true
 proc-macro2 = { version = "1.0", default-features = false }
 quote = { version = "1.0", default-features = false }
 syn = { version = "1.0", default-features = false }
-parquet = { path = "../parquet", version = "19.0.0" }
+parquet = { path = "../parquet", version = "20.0.0" }
diff --git a/parquet_derive/README.md b/parquet_derive/README.md
index 9f35c064a776..30d5e339f26c 100644
--- a/parquet_derive/README.md
+++ b/parquet_derive/README.md
@@ -32,8 +32,8 @@ Add this to your Cargo.toml:
 
 ```toml
 [dependencies]
-parquet = "19.0.0"
-parquet_derive = "19.0.0"
+parquet = "20.0.0"
+parquet_derive = "20.0.0"
 ```
 
 and this to your crate root:
diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml
index d03ea2359840..bf3e78b247ec 100644
--- a/parquet_derive_test/Cargo.toml
+++ b/parquet_derive_test/Cargo.toml
@@ -17,7 +17,7 @@
 [package]
 name = "parquet_derive_test"
-version = "19.0.0"
+version = "20.0.0"
 license = "Apache-2.0"
 description = "Integration test package for parquet-derive"
 homepage = "https://github.com/apache/arrow-rs"
@@ -29,6 +29,6 @@ publish = false
 rust-version = "1.62"
 
 [dependencies]
-parquet = { path = "../parquet", version = "19.0.0", default-features = false }
-parquet_derive = { path = "../parquet_derive", version = "19.0.0", default-features = false }
+parquet = { path = "../parquet", version = "20.0.0", default-features = false }
+parquet_derive = { path = "../parquet_derive", version = "20.0.0", default-features = false }
 chrono = { version="0.4.19", default-features = false, features = [ "clock" ] }
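Editor's note: for readers unfamiliar with `parquet_derive`, the crate whose version is bumped here provides a `ParquetRecordWriter` derive macro that generates the column-writing boilerplate for a plain struct. The sketch below shows only the derive itself; the struct and field names are illustrative, and the end-to-end writer setup (building a `SerializedFileWriter`, opening a row group, calling `write_to_row_group`) follows the parquet_derive README rather than anything defined in this diff.

```rust
use parquet::record::RecordWriter; // trait used by the generated impl
use parquet_derive::ParquetRecordWriter;

// Deriving ParquetRecordWriter generates a RecordWriter implementation for
// slices of this struct, so rows can be written without hand-written
// per-column code.
#[derive(ParquetRecordWriter)]
struct SensorReading {
    sensor_id: String,
    temperature: f64,
    healthy: bool,
}

fn main() {
    let _rows = vec![SensorReading {
        sensor_id: "a-1".to_string(),
        temperature: 21.5,
        healthy: true,
    }];
    // Writing the rows out goes through parquet's file writer; see the
    // parquet_derive README for the complete example.
}
```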