diff --git a/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml b/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml
index 6b8b7b4c13..08dac0fe13 100644
--- a/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml
@@ -9,7 +9,8 @@ body:
       description: What Apache Iceberg version are you using?
       multiple: false
       options:
-        - "0.7.1 (latest release)"
+        - "0.8.0 (latest release)"
+        - "0.7.1"
         - "0.7.0"
         - "0.6.1"
         - "0.6.0"
diff --git a/dev/Dockerfile b/dev/Dockerfile
index 02affa78e2..5f6214a4f6 100644
--- a/dev/Dockerfile
+++ b/dev/Dockerfile
@@ -39,7 +39,7 @@ WORKDIR ${SPARK_HOME}
 ENV SPARK_VERSION=3.5.0
 ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
 ENV ICEBERG_VERSION=1.6.0
-ENV PYICEBERG_VERSION=0.7.1
+ENV PYICEBERG_VERSION=0.8.0
 RUN curl --retry 3 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
     && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
diff --git a/mkdocs/docs/how-to-release.md b/mkdocs/docs/how-to-release.md
index f79f18ca82..bea5548748 100644
--- a/mkdocs/docs/how-to-release.md
+++ b/mkdocs/docs/how-to-release.md
@@ -17,15 +17,31 @@
  - under the License.
  -->
-# How to release
+# How to Release
-The guide to release PyIceberg.
+This guide outlines the process for releasing PyIceberg in accordance with the [Apache Release Process](https://infra.apache.org/release-publishing.html). The steps include:
-The first step is to publish a release candidate (RC) and publish it to the public for testing and validation. Once the vote has passed on the RC, the RC turns into the new release.
+1. Preparing for a release
+2. Publishing a Release Candidate (RC)
+3. Community Voting and Validation
+4. Publishing the Final Release (if the vote passes)
+5. Post-Release Steps
-## Preparing for a release
+## Requirements
-Before running the release candidate, we want to remove any APIs that were marked for removal under the @deprecated tag for this release.
+* A GPG key must be registered and published in the [Apache Iceberg KEYS file](https://downloads.apache.org/iceberg/KEYS). Follow [the instructions for setting up a GPG key and uploading it to the KEYS file](#set-up-gpg-key-and-upload-to-apache-iceberg-keys-file).
+* SVN Access
+    * Permission to upload artifacts to the [Apache development distribution](https://dist.apache.org/repos/dist/dev/iceberg/) (requires Apache Committer access).
+    * Permission to upload artifacts to the [Apache release distribution](https://dist.apache.org/repos/dist/release/iceberg/) (requires Apache PMC access).
+* PyPI Access
+    * The `twine` package must be installed for uploading releases to PyPI.
+    * A PyPI account with publishing permissions for the [pyiceberg project](https://pypi.org/project/pyiceberg/).
+
+## Preparing for a Release
+
+### Remove Deprecated APIs
+
+Before running the release candidate, we want to remove any APIs that were marked for removal under the `@deprecated` tag for this release. See [#1269](https://github.com/apache/iceberg-python/pull/1269).
 For example, the API with the following deprecation tag should be removed when preparing for the 0.2.0 release.
@@ -48,23 +64,49 @@ deprecation_message(
 )
 ```
-## Running a release candidate
+### Update Library Version
+
+Update the version in `pyproject.toml` and `pyiceberg/__init__.py` to match the release version. See [#1276](https://github.com/apache/iceberg-python/pull/1276).
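+As a quick consistency check before tagging, the two version strings can be compared from the repository root. This is a minimal sketch; the `0.8.1` value in the comment is only illustrative:
+
+```bash
+# Both commands should print the same release version, e.g. 0.8.1
+grep -E '^version = ' pyproject.toml
+grep -E '^__version__' pyiceberg/__init__.py
+```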
+
+## Publishing a Release Candidate (RC)
+
+### Release Types
+
+#### Major/Minor Release
-Make sure that the version is correct in `pyproject.toml` and `pyiceberg/__init__.py`. Correct means that it reflects the version that you want to release.
+* Use the `main` branch for the release.
+* Includes new features, enhancements, and any necessary backward-compatible changes.
+* Examples: `0.8.0`, `0.9.0`, `1.0.0`.
-### Setting the tag
+#### Patch Release
-Make sure that you're on the right branch, and the latest branch:
+* Use the branch corresponding to the patch version, such as `pyiceberg-0.8.x`.
+* Focuses on critical bug fixes or security patches that maintain backward compatibility.
+* Examples: `0.8.1`, `0.8.2`.
-For a Major/Minor release, make sure that you're on `main`, for patch versions the branch corresponding to the version that you want to patch, i.e. `pyiceberg-0.6.x`.
+To create a patch branch from the latest release tag:
 ```bash
-git checkout
-git fetch --all
-git reset --hard apache/
+# Fetch all tags
+git fetch --tags
+
+# Assuming 0.8.0 is the latest release tag
+git checkout -b pyiceberg-0.8.x pyiceberg-0.8.0
+
+# Cherry-pick commits for the upcoming patch release
+git cherry-pick
 ```
-Set the tag on the last commit:
+### Create Tag
+
+Ensure you are on the correct branch:
+
+* For a major/minor release, use the `main` branch.
+* For a patch release, use the branch corresponding to the patch version, e.g. `pyiceberg-0.8.x`.
+
+Create a signed tag, replacing `VERSION` and `RC` with the appropriate values for the release:
 ```bash
 export RC=rc1
@@ -74,48 +116,49 @@
 export VERSION_BRANCH=${VERSION_WITHOUT_RC//./-}
 export GIT_TAG=pyiceberg-${VERSION}
 git tag -s ${GIT_TAG} -m "PyIceberg ${VERSION}"
-git push apache ${GIT_TAG}
-
-export GIT_TAG_REF=$(git show-ref ${GIT_TAG})
-export GIT_TAG_HASH=${GIT_TAG_REF:0:40}
-export LAST_COMMIT_ID=$(git rev-list ${GIT_TAG} 2> /dev/null | head -n 1)
+git push git@github.com:apache/iceberg-python.git ${GIT_TAG}
 ```
-The `-s` option will sign the commit. If you don't have a key yet, you can find the instructions [here](http://www.apache.org/dev/openpgp.html#key-gen-generate-key). To install gpg on a M1 based Mac, a couple of additional steps are required: .
-If you have not published your GPG key in [KEYS](https://downloads.apache.org/iceberg/KEYS) yet, you must publish it before sending the vote email by doing:
-
-```bash
-svn co https://dist.apache.org/repos/dist/release/iceberg icebergsvn
-cd icebergsvn
-echo "" >> KEYS # append a newline
-gpg --list-sigs >> KEYS # append signatures
-gpg --armor --export >> KEYS # append public key block
-svn commit -m "add key for "
-```
+### Publish Release Candidate (RC)
-### Upload to Apache SVN
+#### Upload to Apache Dev SVN
-Both the source distribution (`sdist`) and the binary distributions (`wheels`) need to be published for the RC. The wheels are convenient to avoid having people to install compilers locally. The downside is that each architecture requires its own wheel. [use `cibuildwheel`](https://github.com/pypa/cibuildwheel) runs in Github actions to create a wheel for each of the architectures.
+##### Create Artifacts for SVN
-Before committing the files to the Apache SVN artifact distribution SVN hashes need to be generated, and those need to be signed with gpg to make sure that they are authentic.
+Run the [`Python release` Github Action](https://github.com/apache/iceberg-python/actions/workflows/python-release.yml).
-Go to [Github Actions and run the `Python release` action](https://github.com/apache/iceberg-python/actions/workflows/python-release.yml). **Set the version to main, since we cannot modify the source**.
+* Tag: Use the newly created tag.
+* Version: Set the `version` to `main`, as the source cannot be modified.
 ![Github Actions Run Workflow for SVN Upload](assets/images/ghactions-run-workflow-svn-upload.png)
-Download the zip, and sign the files:
+This action will generate:
+
+* Source distribution (`sdist`)
+* Binary distributions (`wheels`) for each architecture. These are created using [`cibuildwheel`](https://github.com/pypa/cibuildwheel).
+
+##### Download Artifacts, Sign, and Generate Checksums
+
+Download the ZIP file containing the artifacts from the GitHub Actions run and unzip it.
+
+Navigate to the release directory. Sign the files and generate checksums:
+
+* `.asc` files: GPG-signed versions of each artifact to ensure authenticity.
+* `.sha512` files: SHA-512 checksums for verifying file integrity.
 ```bash
 cd release-main/
 for name in $(ls pyiceberg-*.whl pyiceberg-*.tar.gz)
 do
-    gpg --yes --armor --local-user fokko@apache.org --output "${name}.asc" --detach-sig "${name}"
+    gpg --yes --armor --output "${name}.asc" --detach-sig "${name}"
     shasum -a 512 "${name}" > "${name}.sha512"
 done
 ```
-Now we can upload the files from the same directory:
+##### Upload Artifacts to Apache Dev SVN
+
+Now, upload the files from the same directory:
 ```bash
 export SVN_TMP_DIR=/tmp/iceberg-${VERSION_BRANCH}/
@@ -128,21 +171,59 @@
 svn add $SVN_TMP_DIR_VERSIONED
 svn ci -m "PyIceberg ${VERSION}" ${SVN_TMP_DIR_VERSIONED}
 ```
-### Upload to PyPi
+Verify the artifact is uploaded to [https://dist.apache.org/repos/dist/dev/iceberg](https://dist.apache.org/repos/dist/dev/iceberg/).
+
+##### Remove Old Artifacts From Apache Dev SVN
+
+Clean up old RC artifacts:
+
+```bash
+svn delete https://dist.apache.org/repos/dist/dev/iceberg/pyiceberg- -m "Remove old RC artifacts"
+```
+
+#### Upload to PyPI
-Go to Github Actions and run the `Python release` action again. This time, set the **version** of the release candidate as the input: e.g. `0.7.0rc1`. Download the zip and unzip it locally.
+##### Create Artifacts for PyPI
+
+Run the [`Python release` Github Action](https://github.com/apache/iceberg-python/actions/workflows/python-release.yml).
+
+* Tag: Use the newly created tag.
+* Version: Set the `version` to the release candidate version, e.g. `0.7.0rc1`.
 ![Github Actions Run Workflow for PyPi Upload](assets/images/ghactions-run-workflow-pypi-upload.png)
-Next step is to upload them to pypi. Please keep in mind that this **won't** bump the version for everyone that hasn't pinned their version, since it is set to an RC [pre-release and those are ignored](https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/#pre-release-versioning).
+##### Download Artifacts
+
+Download the zip file from the Github Action run and unzip it locally.
+
+##### Upload Artifacts to PyPI
+
+Upload the release candidate to PyPI. This **won't** bump the version for everyone that hasn't pinned their version, since it is an RC [pre-release and those are ignored](https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/#pre-release-versioning).
+
+!!! note
+    `twine` might require a PyPI API token.
+
 ```bash
-twine upload release-0.7.0rc1/*
+twine upload release-${VERSION}/*
 ```
+Verify the artifact is uploaded to [PyPI](https://pypi.org/project/pyiceberg/#history).
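+As an optional sanity check, the RC can be installed from PyPI into a clean virtual environment. This is a minimal sketch; the `0.8.1rc1` version is only illustrative:
+
+```bash
+python -m venv /tmp/pyiceberg-rc && source /tmp/pyiceberg-rc/bin/activate
+# pip skips pre-releases unless the RC version is pinned explicitly
+pip install "pyiceberg==0.8.1rc1"
+python -c "import pyiceberg; print(pyiceberg.__version__)"
+```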
+
+## Vote
+
+### Generate Vote Email
+
 Final step is to generate the email to the dev mail list:
 ```bash
+export GIT_TAG_REF=$(git show-ref ${GIT_TAG})
+export GIT_TAG_HASH=${GIT_TAG_REF:0:40}
+export LAST_COMMIT_ID=$(git rev-list ${GIT_TAG} 2> /dev/null | head -n 1)
+
 cat << EOF > release-announcement-email.txt
 To: dev@iceberg.apache.org
 Subject: [VOTE] Release Apache PyIceberg $VERSION
@@ -185,12 +266,19 @@
 Please vote in the next 72 hours.
 [ ] +0
 [ ] -1 Do not release this because...
 EOF
-
-cat release-announcement-email.txt
 ```
-## Vote has passed
+### Send Vote Email
+Verify the content of `release-announcement-email.txt` and send it to `dev@iceberg.apache.org` with the corresponding subject line.
+
+## Vote has failed
+
+If there are concerns with the RC, address the issues and generate another RC.
+
+## Publish the Final Release (Vote has passed)
+
+A minimum of 3 binding +1 votes is required to pass an RC.
 Once the vote has been passed, you can close the vote thread by concluding it:
 ```text
@@ -205,36 +293,54 @@
 The release candidate has been accepted as PyIceberg . Thanks everyone,
 Kind regards,
 ```
-### Copy the artifacts to the release dist
+### Upload the accepted RC to Apache Release SVN
+
-```bash
-export RC=rc2
-export VERSION=0.7.0${RC}
-export VERSION_WITHOUT_RC=${VERSION/rc?/}
+!!! note
+    Only a PMC member has the permission to upload an artifact to the SVN release dist.
+
+```bash
 export SVN_DEV_DIR_VERSIONED="https://dist.apache.org/repos/dist/dev/iceberg/pyiceberg-${VERSION}"
 export SVN_RELEASE_DIR_VERSIONED="https://dist.apache.org/repos/dist/release/iceberg/pyiceberg-${VERSION_WITHOUT_RC}"
 svn mv ${SVN_DEV_DIR_VERSIONED} ${SVN_RELEASE_DIR_VERSIONED} -m "PyIceberg: Add release ${VERSION_WITHOUT_RC}"
 ```
-
+Verify the artifact is uploaded to [https://dist.apache.org/repos/dist/release/iceberg](https://dist.apache.org/repos/dist/release/iceberg/).
-!!! note
-    Only a PMC member has the permission to upload an artifact to the SVN release dist.
+### Remove Old Artifacts From Apache Release SVN
-
+We only want to host the latest release. Clean up old release artifacts:
+
+```bash
+svn delete https://dist.apache.org/repos/dist/release/iceberg/pyiceberg- -m "Remove old release artifacts"
+```
 ### Upload the accepted release to PyPi
 The latest version can be pushed to PyPi. Check out the Apache SVN and make sure to publish the right version with `twine`:
+
+!!! note
+    `twine` might require a PyPI API token.
+
 ```bash
 svn checkout https://dist.apache.org/repos/dist/release/iceberg /tmp/iceberg-dist-release/
 cd /tmp/iceberg-dist-release/pyiceberg-${VERSION_WITHOUT_RC}
 twine upload pyiceberg-*.whl pyiceberg-*.tar.gz
 ```
+Verify the artifact is uploaded to [PyPI](https://pypi.org/project/pyiceberg/#history).
+
+## Post Release
+
+### Send out Release Announcement Email
+
 Send out an announcement on the dev mail list:
 ```text
@@ -253,19 +359,19 @@
 This Python release can be downloaded from: https://pypi.org/project/pyiceberg/<
 Thanks to everyone for contributing!
 ```
-## Release the docs
+### Release the docs
-A committer triggers the [`Python Docs` Github Actions](https://github.com/apache/iceberg-python/actions/workflows/python-ci-docs.yml) through the UI by selecting the branch that just has been released. This will publish the new docs.
+Run the [`Release Docs` Github Action](https://github.com/apache/iceberg-python/actions/workflows/python-release-docs.yml).
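+Once the action completes, verify that the updated docs are live at [https://py.iceberg.apache.org/](https://py.iceberg.apache.org/). A quick command-line check (a sketch that assumes the landing page mentions the release version; `0.8.1` is illustrative):
+
+```bash
+# Exits non-zero if the new version string is not found on the landing page
+curl -s https://py.iceberg.apache.org/ | grep -q "0\.8\.1"
+```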
-## Update the Github template
+### Update the Github template
 Make sure to create a PR to update the [GitHub issues template](https://github.com/apache/iceberg-python/blob/main/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml) with the latest version.
-## Update the integration tests
+### Update the integration tests
 Ensure to update the `PYICEBERG_VERSION` in the [Dockerfile](https://github.com/apache/iceberg-python/blob/main/dev/Dockerfile).
-## Create a Github Release Note
+### Create a Github Release Note
 Create a [new Release Note](https://github.com/apache/iceberg-python/releases/new) on the iceberg-python Github repository.
@@ -278,3 +384,22 @@
 Then, select the previous release version as the **Previous tag** to use the diff.
 **Generate release notes**.
 **Set as the latest release** and **Publish**.
+
+## Misc
+
+### Set up GPG key and Upload to Apache Iceberg KEYS file
+
+To set up a GPG key locally, see the instructions [here](http://www.apache.org/dev/openpgp.html#key-gen-generate-key).
+
+To install gpg on an M1-based Mac, a couple of additional steps are required: .
+
+Then, publish the GPG key to the [Apache Iceberg KEYS file](https://downloads.apache.org/iceberg/KEYS):
+
+```bash
+svn co https://dist.apache.org/repos/dist/release/iceberg icebergsvn
+cd icebergsvn
+echo "" >> KEYS # append a newline
+gpg --list-sigs >> KEYS # append signatures
+gpg --armor --export >> KEYS # append public key block
+svn commit -m "add key for "
+```
diff --git a/poetry.lock b/poetry.lock
index af1e32d79a..85922de0e3 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2562,43 +2562,47 @@ files = [
 [[package]]
 name = "numpy"
-version = "1.26.0"
+version = "1.26.4"
 description = "Fundamental package for array computing in Python"
 optional = true
-python-versions = "<3.13,>=3.9"
-files = [
-    {file = "numpy-1.26.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f8db2f125746e44dce707dd44d4f4efeea8d7e2b43aace3f8d1f235cfa2733dd"},
-    {file = "numpy-1.26.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0621f7daf973d34d18b4e4bafb210bbaf1ef5e0100b5fa750bd9cde84c7ac292"},
-    {file = "numpy-1.26.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51be5f8c349fdd1a5568e72713a21f518e7d6707bcf8503b528b88d33b57dc68"},
-    {file = "numpy-1.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:767254ad364991ccfc4d81b8152912e53e103ec192d1bb4ea6b1f5a7117040be"},
-    {file = "numpy-1.26.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:436c8e9a4bdeeee84e3e59614d38c3dbd3235838a877af8c211cfcac8a80b8d3"},
-    {file = "numpy-1.26.0-cp310-cp310-win32.whl", hash = "sha256:c2e698cb0c6dda9372ea98a0344245ee65bdc1c9dd939cceed6bb91256837896"},
-    {file = "numpy-1.26.0-cp310-cp310-win_amd64.whl", hash = "sha256:09aaee96c2cbdea95de76ecb8a586cb687d281c881f5f17bfc0fb7f5890f6b91"},
-    {file = "numpy-1.26.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:637c58b468a69869258b8ae26f4a4c6ff8abffd4a8334c830ffb63e0feefe99a"},
-    {file = "numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:306545e234503a24fe9ae95ebf84d25cba1fdc27db971aa2d9f1ab6bba19a9dd"},
-    {file = "numpy-1.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c6adc33561bd1d46f81131d5352348350fc23df4d742bb246cdfca606ea1208"},
-    {file = "numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e062aa24638bb5018b7841977c360d2f5917268d125c833a686b7cbabbec496c"},
-    {file = "numpy-1.26.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:546b7dd7e22f3c6861463bebb000646fa730e55df5ee4a0224408b5694cc6148"}, - {file = "numpy-1.26.0-cp311-cp311-win32.whl", hash = "sha256:c0b45c8b65b79337dee5134d038346d30e109e9e2e9d43464a2970e5c0e93229"}, - {file = "numpy-1.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:eae430ecf5794cb7ae7fa3808740b015aa80747e5266153128ef055975a72b99"}, - {file = "numpy-1.26.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:166b36197e9debc4e384e9c652ba60c0bacc216d0fc89e78f973a9760b503388"}, - {file = "numpy-1.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f042f66d0b4ae6d48e70e28d487376204d3cbf43b84c03bac57e28dac6151581"}, - {file = "numpy-1.26.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5e18e5b14a7560d8acf1c596688f4dfd19b4f2945b245a71e5af4ddb7422feb"}, - {file = "numpy-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f6bad22a791226d0a5c7c27a80a20e11cfe09ad5ef9084d4d3fc4a299cca505"}, - {file = "numpy-1.26.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4acc65dd65da28060e206c8f27a573455ed724e6179941edb19f97e58161bb69"}, - {file = "numpy-1.26.0-cp312-cp312-win32.whl", hash = "sha256:bb0d9a1aaf5f1cb7967320e80690a1d7ff69f1d47ebc5a9bea013e3a21faec95"}, - {file = "numpy-1.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:ee84ca3c58fe48b8ddafdeb1db87388dce2c3c3f701bf447b05e4cfcc3679112"}, - {file = "numpy-1.26.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4a873a8180479bc829313e8d9798d5234dfacfc2e8a7ac188418189bb8eafbd2"}, - {file = "numpy-1.26.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:914b28d3215e0c721dc75db3ad6d62f51f630cb0c277e6b3bcb39519bed10bd8"}, - {file = "numpy-1.26.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c78a22e95182fb2e7874712433eaa610478a3caf86f28c621708d35fa4fd6e7f"}, - {file = "numpy-1.26.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86f737708b366c36b76e953c46ba5827d8c27b7a8c9d0f471810728e5a2fe57c"}, - {file = "numpy-1.26.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b44e6a09afc12952a7d2a58ca0a2429ee0d49a4f89d83a0a11052da696440e49"}, - {file = "numpy-1.26.0-cp39-cp39-win32.whl", hash = "sha256:5671338034b820c8d58c81ad1dafc0ed5a00771a82fccc71d6438df00302094b"}, - {file = "numpy-1.26.0-cp39-cp39-win_amd64.whl", hash = "sha256:020cdbee66ed46b671429c7265cf00d8ac91c046901c55684954c3958525dab2"}, - {file = "numpy-1.26.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0792824ce2f7ea0c82ed2e4fecc29bb86bee0567a080dacaf2e0a01fe7654369"}, - {file = "numpy-1.26.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d484292eaeb3e84a51432a94f53578689ffdea3f90e10c8b203a99be5af57d8"}, - {file = "numpy-1.26.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:186ba67fad3c60dbe8a3abff3b67a91351100f2661c8e2a80364ae6279720299"}, - {file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"}, +python-versions = ">=3.9" +files = [ + {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, + {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"}, + {file = 
"numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"}, + {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"}, + {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"}, + {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"}, + {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, + {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, + {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, + {file = 
"numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, + {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, + {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, + {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] [[package]] @@ -4569,5 +4573,5 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" -python-versions = "^3.9, <3.13, !=3.9.7" -content-hash = "54d7d52db7c08c6474f28aa9f62cb7f3d745c0341969db0ccb76b0195b3372a2" +python-versions = "^3.9, !=3.9.7" +content-hash = "c711643812ed5d98298621a7b46050cd1d2a8a7f6c288de9e1d7d20a94bb1a69" diff --git a/pyiceberg/__init__.py b/pyiceberg/__init__.py index 42c6e12f1b..0e0cb642df 100644 --- a/pyiceberg/__init__.py +++ b/pyiceberg/__init__.py @@ -15,4 +15,4 @@ # specific language governing permissions and limitations # under the License. -__version__ = "0.8.0" +__version__ = "0.8.1" diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 5742173fa6..ca7b3a4184 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -459,7 +459,7 @@ def commit_table( NoSuchTableError: If a table with the given identifier does not exist. CommitFailedException: Requirement not met, or a conflict with a concurrent commit. """ - table_identifier = self._identifier_to_tuple_without_catalog(table.identifier) + table_identifier = table.name() database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError) current_glue_table: Optional[TableTypeDef] @@ -773,4 +773,4 @@ def drop_view(self, identifier: Union[str, Identifier]) -> None: @staticmethod def __is_iceberg_table(table: TableTypeDef) -> bool: - return table.get("Parameters", {}).get("table_type", "").lower() == ICEBERG + return table.get("Parameters", {}).get(TABLE_TYPE, "").lower() == ICEBERG diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 030470e164..d400901160 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -314,7 +314,7 @@ def _convert_hive_into_iceberg(self, table: HiveTable) -> Table: ) def _convert_iceberg_into_hive(self, table: Table) -> HiveTable: - identifier_tuple = self._identifier_to_tuple_without_catalog(table.identifier) + identifier_tuple = table.name() database_name, table_name = self.identifier_to_database_and_table(identifier_tuple, NoSuchTableError) current_time_millis = int(time.time() * 1000) @@ -455,7 +455,7 @@ def commit_table( NoSuchTableError: If a table with the given identifier does not exist. 
CommitFailedException: Requirement not met, or a conflict with a concurrent commit. """ - table_identifier = self._identifier_to_tuple_without_catalog(table.identifier) + table_identifier = table.name() database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError) # commit to hive # https://github.com/apache/hive/blob/master/standalone-metastore/metastore-common/src/main/thrift/hive_metastore.thrift#L1232 @@ -651,7 +651,7 @@ def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: for table in open_client.get_table_objects_by_name( dbname=database_name, tbl_names=open_client.get_all_tables(db_name=database_name) ) - if table.parameters[TABLE_TYPE].lower() == ICEBERG + if table.parameters.get(TABLE_TYPE, "").lower() == ICEBERG ] def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: diff --git a/pyiceberg/catalog/rest.py b/pyiceberg/catalog/rest.py index 2b48330bfc..e2584921ea 100644 --- a/pyiceberg/catalog/rest.py +++ b/pyiceberg/catalog/rest.py @@ -775,7 +775,7 @@ def commit_table( CommitFailedException: Requirement not met, or a conflict with a concurrent commit. CommitStateUnknownException: Failed due to an internal exception on the side of the catalog. """ - identifier = self._identifier_to_tuple_without_catalog(table.identifier) + identifier = table.name() table_identifier = TableIdentifier(namespace=identifier[:-1], name=identifier[-1]) table_request = CommitTableRequest(identifier=table_identifier, requirements=requirements, updates=updates) @@ -899,7 +899,7 @@ def table_exists(self, identifier: Union[str, Identifier]) -> bool: @retry(**_RETRY_ARGS) def drop_view(self, identifier: Union[str]) -> None: - identifier_tuple = self.identifier_to_tuple_without_catalog(identifier) + identifier_tuple = self._identifier_to_tuple_without_catalog(identifier) response = self._session.delete( self.url( Endpoints.drop_view, prefixed=True, **self._split_identifier_for_path(identifier_tuple, IdentifierKind.VIEW) diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index 6a4318253f..9776cc6bec 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -419,7 +419,7 @@ def commit_table( NoSuchTableError: If a table with the given identifier does not exist. CommitFailedException: Requirement not met, or a conflict with a concurrent commit. 
""" - table_identifier = self._identifier_to_tuple_without_catalog(table.identifier) + table_identifier = table.name() namespace_tuple = Catalog.namespace_from(table_identifier) namespace = Catalog.namespace_to_string(namespace_tuple) table_name = Catalog.table_name_from(table_identifier) @@ -430,7 +430,7 @@ def commit_table( except NoSuchTableError: current_table = None - updated_staged_table = self._update_and_stage_table(current_table, table.identifier, requirements, updates) + updated_staged_table = self._update_and_stage_table(current_table, table.name(), requirements, updates) if current_table and updated_staged_table.metadata == current_table.metadata: # no changes, do nothing return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) diff --git a/pyiceberg/cli/output.py b/pyiceberg/cli/output.py index 56b544c99f..13a15c53f9 100644 --- a/pyiceberg/cli/output.py +++ b/pyiceberg/cli/output.py @@ -137,7 +137,7 @@ def files(self, table: Table, history: bool) -> None: else: snapshots = [] - snapshot_tree = Tree(f"Snapshots: {'.'.join(table.identifier)}") + snapshot_tree = Tree(f"Snapshots: {'.'.join(table.name())}") io = table.io for snapshot in snapshots: @@ -216,7 +216,7 @@ class FauxTable(IcebergBaseModel): print( FauxTable( - identifier=table.identifier, metadata=table.metadata, metadata_location=table.metadata_location + identifier=table.name(), metadata=table.metadata, metadata_location=table.metadata_location ).model_dump_json() ) diff --git a/pyiceberg/expressions/parser.py b/pyiceberg/expressions/parser.py index dcd8dceb2c..056defefb4 100644 --- a/pyiceberg/expressions/parser.py +++ b/pyiceberg/expressions/parser.py @@ -79,7 +79,7 @@ NAN = CaselessKeyword("nan") LIKE = CaselessKeyword("like") -unquoted_identifier = Word(alphas, alphanums + "_$") +unquoted_identifier = Word(alphas + "_", alphanums + "_$") quoted_identifier = Suppress('"') + unquoted_identifier + Suppress('"') identifier = MatchFirst([unquoted_identifier, quoted_identifier]).set_results_name("identifier") column = DelimitedList(identifier, delim=".", combine=False).set_results_name("column") diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 9ab1981069..23aec2d35e 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -2397,8 +2397,8 @@ def data_file_statistics_from_parquet_metadata( split_offsets.sort() for field_id in invalidate_col: - del col_aggs[field_id] - del null_value_counts[field_id] + col_aggs.pop(field_id, None) + null_value_counts.pop(field_id, None) return DataFileStatistics( record_count=parquet_metadata.num_rows, diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 649840fc66..6774499f2e 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -957,7 +957,11 @@ def __init__(self, output_file: OutputFile, snapshot_id: int, parent_snapshot_id super().__init__( format_version=1, output_file=output_file, - meta={"snapshot-id": str(snapshot_id), "parent-snapshot-id": str(parent_snapshot_id), "format-version": "1"}, + meta={ + "snapshot-id": str(snapshot_id), + "parent-snapshot-id": str(parent_snapshot_id) if parent_snapshot_id is not None else "null", + "format-version": "1", + }, ) def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile: @@ -976,7 +980,7 @@ def __init__(self, output_file: OutputFile, snapshot_id: int, parent_snapshot_id output_file=output_file, meta={ "snapshot-id": str(snapshot_id), - "parent-snapshot-id": str(parent_snapshot_id), + "parent-snapshot-id": 
str(parent_snapshot_id) if parent_snapshot_id is not None else "null", "sequence-number": str(sequence_number), "format-version": "2", }, diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 8055082542..3eb74eee1f 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -801,7 +801,7 @@ def name(self) -> Identifier: Returns: An Identifier tuple of the table name """ - return self.identifier + return self._identifier def scan( self, @@ -822,7 +822,7 @@ def scan( row_filter: A string or BooleanExpression that decsribes the desired rows - selected_fileds: + selected_fields: A tuple of strings representing the column names to return in the output dataframe. case_sensitive: diff --git a/pyproject.toml b/pyproject.toml index fe57631fc8..1315ed4f2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ # under the License. [tool.poetry] name = "pyiceberg" -version = "0.8.0" +version = "0.8.1" readme = "README.md" homepage = "https://py.iceberg.apache.org/" repository = "https://github.com/apache/iceberg-python" @@ -49,7 +49,7 @@ include = [ ] [tool.poetry.dependencies] -python = "^3.9, <3.13, !=3.9.7" +python = "^3.9, !=3.9.7" mmh3 = ">=4.0.0,<6.0.0" requests = ">=2.20.0,<3.0.0" click = ">=7.1.1,<9.0.0" @@ -57,7 +57,7 @@ rich = ">=10.11.0,<14.0.0" strictyaml = ">=1.7.0,<2.0.0" # CVE-2020-14343 was fixed in 5.4. pydantic = ">=2.0,<3.0,!=2.4.0,!=2.4.1" # 2.4.0, 2.4.1 has a critical bug sortedcontainers = "2.4.0" -fsspec = ">=2023.1.0,<2025.1.0" +fsspec = ">=2023.1.0" pyparsing = ">=3.1.0,<4.0.0" zstandard = ">=0.13.0,<1.0.0" tenacity = ">=8.2.3,<10.0.0" @@ -72,9 +72,9 @@ python-snappy = { version = ">=0.6.0,<1.0.0", optional = true } thrift = { version = ">=0.13.0,<1.0.0", optional = true } mypy-boto3-glue = { version = ">=1.28.18", optional = true } boto3 = { version = ">=1.24.59", optional = true } -s3fs = { version = ">=2023.1.0,<2024.1.0", optional = true } -adlfs = { version = ">=2023.1.0,<2024.8.0", optional = true } -gcsfs = { version = ">=2023.1.0,<2024.1.0", optional = true } +s3fs = { version = ">=2023.1.0", optional = true } +adlfs = { version = ">=2023.1.0", optional = true } +gcsfs = { version = ">=2023.1.0", optional = true } psycopg2-binary = { version = ">=2.9.6", optional = true } sqlalchemy = { version = "^2.0.18", optional = true } getdaft = { version = ">=0.2.12", optional = true } diff --git a/tests/catalog/integration_test_dynamodb.py b/tests/catalog/integration_test_dynamodb.py index 05d51bb0ef..895f233c45 100644 --- a/tests/catalog/integration_test_dynamodb.py +++ b/tests/catalog/integration_test_dynamodb.py @@ -57,7 +57,7 @@ def test_create_table( test_catalog.create_namespace(database_name) test_catalog.create_table(identifier, table_schema_nested, get_s3_path(get_bucket_name(), database_name, table_name)) table = test_catalog.load_table(identifier) - assert table.identifier == (test_catalog.name,) + identifier + assert table.name() == identifier metadata_location = table.metadata_location.split(get_bucket_name())[1][1:] s3.head_object(Bucket=get_bucket_name(), Key=metadata_location) @@ -78,7 +78,7 @@ def test_create_table_with_default_location( test_catalog.create_namespace(database_name) test_catalog.create_table(identifier, table_schema_nested) table = test_catalog.load_table(identifier) - assert table.identifier == (test_catalog.name,) + identifier + assert table.name() == identifier metadata_location = table.metadata_location.split(get_bucket_name())[1][1:] s3.head_object(Bucket=get_bucket_name(), 
Key=metadata_location) @@ -102,7 +102,7 @@ def test_create_table_if_not_exists_duplicated_table( test_catalog.create_namespace(database_name) table1 = test_catalog.create_table((database_name, table_name), table_schema_nested) table2 = test_catalog.create_table_if_not_exists((database_name, table_name), table_schema_nested) - assert table1.identifier == table2.identifier + assert table1.name() == table2.name() def test_load_table(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_name: str) -> None: @@ -110,7 +110,7 @@ def test_load_table(test_catalog: Catalog, table_schema_nested: Schema, database test_catalog.create_namespace(database_name) table = test_catalog.create_table(identifier, table_schema_nested) loaded_table = test_catalog.load_table(identifier) - assert table.identifier == loaded_table.identifier + assert table.name() == loaded_table.name() assert table.metadata_location == loaded_table.metadata_location assert table.metadata == loaded_table.metadata @@ -134,11 +134,11 @@ def test_rename_table( new_table_name = f"rename-{table_name}" identifier = (database_name, table_name) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (test_catalog.name,) + identifier + assert table.name() == identifier new_identifier = (new_database_name, new_table_name) test_catalog.rename_table(identifier, new_identifier) new_table = test_catalog.load_table(new_identifier) - assert new_table.identifier == (test_catalog.name,) + new_identifier + assert new_table.name() == new_identifier assert new_table.metadata_location == table.metadata_location metadata_location = new_table.metadata_location.split(get_bucket_name())[1][1:] s3.head_object(Bucket=get_bucket_name(), Key=metadata_location) @@ -150,7 +150,7 @@ def test_drop_table(test_catalog: Catalog, table_schema_nested: Schema, table_na identifier = (database_name, table_name) test_catalog.create_namespace(database_name) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (test_catalog.name,) + identifier + assert table.name() == identifier test_catalog.drop_table(identifier) with pytest.raises(NoSuchTableError): test_catalog.load_table(identifier) @@ -163,7 +163,7 @@ def test_purge_table( test_catalog.create_namespace(database_name) test_catalog.create_table(identifier, table_schema_nested) table = test_catalog.load_table(identifier) - assert table.identifier == (test_catalog.name,) + identifier + assert table.name() == identifier metadata_location = table.metadata_location.split(get_bucket_name())[1][1:] s3.head_object(Bucket=get_bucket_name(), Key=metadata_location) test_catalog.purge_table(identifier) diff --git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index a5293e38f2..475fc07ead 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -119,7 +119,7 @@ def test_create_table( test_catalog.create_namespace(database_name) test_catalog.create_table(identifier, table_schema_nested, get_s3_path(get_bucket_name(), database_name, table_name)) table = test_catalog.load_table(identifier) - assert table.identifier == (CATALOG_NAME,) + identifier + assert table.name() == identifier metadata_location = table.metadata_location.split(get_bucket_name())[1][1:] s3.head_object(Bucket=get_bucket_name(), Key=metadata_location) assert MetastoreCatalog._parse_metadata_version(table.metadata_location) == 0 @@ -183,7 +183,7 @@ def test_create_table_with_default_location( 
test_catalog.create_namespace(database_name) test_catalog.create_table(identifier, table_schema_nested) table = test_catalog.load_table(identifier) - assert table.identifier == (CATALOG_NAME,) + identifier + assert table.name() == identifier metadata_location = table.metadata_location.split(get_bucket_name())[1][1:] s3.head_object(Bucket=get_bucket_name(), Key=metadata_location) assert MetastoreCatalog._parse_metadata_version(table.metadata_location) == 0 @@ -208,7 +208,7 @@ def test_create_table_if_not_exists_duplicated_table( test_catalog.create_namespace(database_name) table1 = test_catalog.create_table((database_name, table_name), table_schema_nested) table2 = test_catalog.create_table_if_not_exists((database_name, table_name), table_schema_nested) - assert table1.identifier == table2.identifier + assert table1.name() == table2.name() def test_load_table(test_catalog: Catalog, table_schema_nested: Schema, table_name: str, database_name: str) -> None: @@ -216,7 +216,7 @@ def test_load_table(test_catalog: Catalog, table_schema_nested: Schema, table_na test_catalog.create_namespace(database_name) table = test_catalog.create_table(identifier, table_schema_nested) loaded_table = test_catalog.load_table(identifier) - assert table.identifier == loaded_table.identifier + assert table.name() == loaded_table.name() assert table.metadata_location == loaded_table.metadata_location assert table.metadata == loaded_table.metadata assert MetastoreCatalog._parse_metadata_version(table.metadata_location) == 0 @@ -242,11 +242,11 @@ def test_rename_table( identifier = (database_name, table_name) table = test_catalog.create_table(identifier, table_schema_nested) assert MetastoreCatalog._parse_metadata_version(table.metadata_location) == 0 - assert table.identifier == (CATALOG_NAME,) + identifier + assert table.name() == identifier new_identifier = (new_database_name, new_table_name) test_catalog.rename_table(identifier, new_identifier) new_table = test_catalog.load_table(new_identifier) - assert new_table.identifier == (CATALOG_NAME,) + new_identifier + assert new_table.name() == new_identifier assert new_table.metadata_location == table.metadata_location metadata_location = new_table.metadata_location.split(get_bucket_name())[1][1:] s3.head_object(Bucket=get_bucket_name(), Key=metadata_location) @@ -258,7 +258,7 @@ def test_drop_table(test_catalog: Catalog, table_schema_nested: Schema, table_na identifier = (database_name, table_name) test_catalog.create_namespace(database_name) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (CATALOG_NAME,) + identifier + assert table.name() == identifier test_catalog.drop_table(identifier) with pytest.raises(NoSuchTableError): test_catalog.load_table(identifier) @@ -271,7 +271,7 @@ def test_purge_table( test_catalog.create_namespace(database_name) test_catalog.create_table(identifier, table_schema_nested) table = test_catalog.load_table(identifier) - assert table.identifier == (CATALOG_NAME,) + identifier + assert table.name() == identifier metadata_location = table.metadata_location.split(get_bucket_name())[1][1:] s3.head_object(Bucket=get_bucket_name(), Key=metadata_location) test_catalog.purge_table(identifier) @@ -536,7 +536,7 @@ def test_create_table_transaction( update_snapshot.append_data_file(data_file) table = test_catalog.load_table(identifier) - assert table.identifier == (CATALOG_NAME,) + identifier + assert table.name() == identifier metadata_location = table.metadata_location.split(get_bucket_name())[1][1:] 
s3.head_object(Bucket=get_bucket_name(), Key=metadata_location) assert MetastoreCatalog._parse_metadata_version(table.metadata_location) == 0 @@ -584,6 +584,6 @@ def test_register_table_with_given_location( test_catalog.drop_table(identifier) # drops the table but keeps the metadata file assert not test_catalog.table_exists(identifier) table = test_catalog.register_table(new_identifier, location) - assert table.identifier == (CATALOG_NAME,) + new_identifier + assert table.name() == new_identifier assert table.metadata_location == location assert test_catalog.table_exists(new_identifier) diff --git a/tests/catalog/test_base.py b/tests/catalog/test_base.py index e212854ee2..92e0a7236b 100644 --- a/tests/catalog/test_base.py +++ b/tests/catalog/test_base.py @@ -133,7 +133,7 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: def commit_table( self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] ) -> CommitTableResponse: - identifier_tuple = self._identifier_to_tuple_without_catalog(table.identifier) + identifier_tuple = table.name() current_table = self.load_table(identifier_tuple) base_metadata = current_table.metadata diff --git a/tests/catalog/test_dynamodb.py b/tests/catalog/test_dynamodb.py index 0f89d12642..7ab875af90 100644 --- a/tests/catalog/test_dynamodb.py +++ b/tests/catalog/test_dynamodb.py @@ -73,7 +73,7 @@ def test_create_table_with_database_location( test_catalog = DynamoDbCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url}) test_catalog.create_namespace(namespace=database_name, properties={"location": f"s3://{BUCKET_NAME}/{database_name}.db"}) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) @@ -90,7 +90,7 @@ def test_create_table_with_pyarrow_schema( test_catalog = DynamoDbCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url}) test_catalog.create_namespace(namespace=database_name, properties={"location": f"s3://{BUCKET_NAME}/{database_name}.db"}) table = test_catalog.create_table(identifier, pyarrow_schema_simple_without_ids) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) @@ -103,7 +103,7 @@ def test_create_table_with_default_warehouse( test_catalog = DynamoDbCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}"}) test_catalog.create_namespace(namespace=database_name) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) @@ -118,7 +118,7 @@ def test_create_table_with_given_location( table = test_catalog.create_table( identifier=identifier, schema=table_schema_nested, location=f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}" ) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) @@ -132,7 +132,7 @@ def test_create_table_removes_trailing_slash_in_location( test_catalog.create_namespace(namespace=database_name) location = f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}" table = test_catalog.create_table(identifier=identifier, schema=table_schema_nested, 
location=f"{location}/") - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert table.location() == location assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) @@ -157,7 +157,7 @@ def test_create_table_with_strips( test_catalog = DynamoDbCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url}) test_catalog.create_namespace(namespace=database_name, properties={"location": f"s3://{BUCKET_NAME}/{database_name}.db/"}) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) @@ -170,7 +170,7 @@ def test_create_table_with_strips_bucket_root( test_catalog = DynamoDbCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}/"}) test_catalog.create_namespace(namespace=database_name) table_strip = test_catalog.create_table(identifier, table_schema_nested) - assert table_strip.identifier == (catalog_name,) + identifier + assert table_strip.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table_strip.metadata_location) @@ -205,7 +205,7 @@ def test_create_table_if_not_exists_duplicated_table( test_catalog.create_namespace(namespace=database_name) table1 = test_catalog.create_table(identifier, table_schema_nested) table2 = test_catalog.create_table_if_not_exists(identifier, table_schema_nested) - assert table1.identifier == table2.identifier + assert table1.name() == table2.name() @mock_aws @@ -218,7 +218,7 @@ def test_load_table( test_catalog.create_namespace(namespace=database_name) test_catalog.create_table(identifier, table_schema_nested) table = test_catalog.load_table(identifier) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) @@ -232,8 +232,8 @@ def test_load_table_from_self_identifier( test_catalog.create_namespace(namespace=database_name) test_catalog.create_table(identifier, table_schema_nested) intermediate = test_catalog.load_table(identifier) - table = test_catalog.load_table(intermediate.identifier) - assert table.identifier == (catalog_name,) + identifier + table = test_catalog.load_table(intermediate.name()) + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) @@ -256,7 +256,7 @@ def test_drop_table( test_catalog.create_namespace(namespace=database_name) test_catalog.create_table(identifier, table_schema_nested) table = test_catalog.load_table(identifier) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) test_catalog.drop_table(identifier) with pytest.raises(NoSuchTableError): @@ -273,13 +273,13 @@ def test_drop_table_from_self_identifier( test_catalog.create_namespace(namespace=database_name) test_catalog.create_table(identifier, table_schema_nested) table = test_catalog.load_table(identifier) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) - test_catalog.drop_table(table.identifier) + test_catalog.drop_table(table.name()) with pytest.raises(NoSuchTableError): test_catalog.load_table(identifier) with pytest.raises(NoSuchTableError): - test_catalog.load_table(table.identifier) + test_catalog.load_table(table.name()) 
@mock_aws @@ -301,11 +301,11 @@ def test_rename_table( test_catalog = DynamoDbCatalog(catalog_name, **{"warehouse": f"s3://{BUCKET_NAME}", "s3.endpoint": moto_endpoint_url}) test_catalog.create_namespace(namespace=database_name) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) test_catalog.rename_table(identifier, new_identifier) new_table = test_catalog.load_table(new_identifier) - assert new_table.identifier == (catalog_name,) + new_identifier + assert new_table.name() == new_identifier # the metadata_location should not change assert new_table.metadata_location == table.metadata_location # old table should be dropped @@ -324,18 +324,18 @@ def test_rename_table_from_self_identifier( test_catalog = DynamoDbCatalog(catalog_name, **{"warehouse": f"s3://{BUCKET_NAME}", "s3.endpoint": moto_endpoint_url}) test_catalog.create_namespace(namespace=database_name) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) - test_catalog.rename_table(table.identifier, new_identifier) + test_catalog.rename_table(table.name(), new_identifier) new_table = test_catalog.load_table(new_identifier) - assert new_table.identifier == (catalog_name,) + new_identifier + assert new_table.name() == new_identifier # the metadata_location should not change assert new_table.metadata_location == table.metadata_location # old table should be dropped with pytest.raises(NoSuchTableError): test_catalog.load_table(identifier) with pytest.raises(NoSuchTableError): - test_catalog.load_table(table.identifier) + test_catalog.load_table(table.name()) @mock_aws diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 26c80bc968..37c22c3fc6 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -59,7 +59,7 @@ def test_create_table_with_database_location( test_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url}) test_catalog.create_namespace(namespace=database_name, properties={"location": f"s3://{BUCKET_NAME}/{database_name}.db"}) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @@ -121,7 +121,7 @@ def test_create_table_with_default_warehouse( test_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}"}) test_catalog.create_namespace(namespace=database_name) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @@ -137,7 +137,7 @@ def test_create_table_with_given_location( table = test_catalog.create_table( identifier=identifier, schema=table_schema_nested, location=f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}" ) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) 
assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @@ -152,7 +152,7 @@ def test_create_table_removes_trailing_slash_in_location( test_catalog.create_namespace(namespace=database_name) location = f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}" table = test_catalog.create_table(identifier=identifier, schema=table_schema_nested, location=f"{location}/") - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert table.location() == location assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @@ -175,7 +175,7 @@ def test_create_table_with_pyarrow_schema( schema=pyarrow_schema_simple_without_ids, location=f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}", ) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @@ -201,7 +201,7 @@ def test_create_table_with_strips( test_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url}) test_catalog.create_namespace(namespace=database_name, properties={"location": f"s3://{BUCKET_NAME}/{database_name}.db/"}) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @@ -210,12 +210,11 @@ def test_create_table_with_strips( def test_create_table_with_strips_bucket_root( _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_name: str ) -> None: - catalog_name = "glue" identifier = (database_name, table_name) test_catalog = GlueCatalog("glue", **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}/"}) test_catalog.create_namespace(namespace=database_name) table_strip = test_catalog.create_table(identifier, table_schema_nested) - assert table_strip.identifier == (catalog_name,) + identifier + assert table_strip.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table_strip.metadata_location) assert test_catalog._parse_metadata_version(table_strip.metadata_location) == 0 @@ -242,7 +241,7 @@ def test_create_table_with_glue_catalog_id( ) test_catalog.create_namespace(namespace=database_name) table = test_catalog.create_table(identifier, table_schema_nested) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @@ -273,7 +272,7 @@ def test_load_table( test_catalog.create_namespace(namespace=database_name) test_catalog.create_table(identifier, table_schema_nested) table = test_catalog.load_table(identifier) - assert table.identifier == (catalog_name,) + identifier + assert table.name() == identifier assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) assert test_catalog._parse_metadata_version(table.metadata_location) == 0 @@ -287,8 +286,8 @@ def test_load_table_from_self_identifier( test_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}/"}) test_catalog.create_namespace(namespace=database_name) intermediate = 
-    table = test_catalog.load_table(intermediate.identifier)
-    assert table.identifier == (catalog_name,) + identifier
+    table = test_catalog.load_table(intermediate.name())
+    assert table.name() == identifier
     assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
@@ -311,7 +310,7 @@ def test_drop_table(
     test_catalog.create_namespace(namespace=database_name)
     test_catalog.create_table(identifier, table_schema_nested)
     table = test_catalog.load_table(identifier)
-    assert table.identifier == (catalog_name,) + identifier
+    assert table.name() == identifier
     assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
     test_catalog.drop_table(identifier)
     with pytest.raises(NoSuchTableError):
@@ -328,13 +327,13 @@ def test_drop_table_from_self_identifier(
     test_catalog.create_namespace(namespace=database_name)
     test_catalog.create_table(identifier, table_schema_nested)
     table = test_catalog.load_table(identifier)
-    assert table.identifier == (catalog_name,) + identifier
+    assert table.name() == identifier
     assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
-    test_catalog.drop_table(table.identifier)
+    test_catalog.drop_table(table.name())
     with pytest.raises(NoSuchTableError):
         test_catalog.load_table(identifier)
     with pytest.raises(NoSuchTableError):
-        test_catalog.load_table(table.identifier)
+        test_catalog.load_table(table.name())


 @mock_aws
@@ -349,19 +348,18 @@ def test_drop_non_exist_table(_bucket_initialize: None, moto_endpoint_url: str,
 def test_rename_table(
     _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_name: str
 ) -> None:
-    catalog_name = "glue"
     new_table_name = f"{table_name}_new"
     identifier = (database_name, table_name)
     new_identifier = (database_name, new_table_name)
     test_catalog = GlueCatalog("glue", **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}/"})
     test_catalog.create_namespace(namespace=database_name)
     table = test_catalog.create_table(identifier, table_schema_nested)
-    assert table.identifier == (catalog_name,) + identifier
+    assert table.name() == identifier
     assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
     assert test_catalog._parse_metadata_version(table.metadata_location) == 0
     test_catalog.rename_table(identifier, new_identifier)
     new_table = test_catalog.load_table(new_identifier)
-    assert new_table.identifier == (catalog_name,) + new_identifier
+    assert new_table.name() == new_identifier
     # the metadata_location should not change
     assert new_table.metadata_location == table.metadata_location
     # old table should be dropped
@@ -373,25 +371,24 @@ def test_rename_table_from_self_identifier(
     _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_name: str
 ) -> None:
-    catalog_name = "glue"
     new_table_name = f"{table_name}_new"
     identifier = (database_name, table_name)
     new_identifier = (database_name, new_table_name)
     test_catalog = GlueCatalog("glue", **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}/"})
     test_catalog.create_namespace(namespace=database_name)
     table = test_catalog.create_table(identifier, table_schema_nested)
-    assert table.identifier == (catalog_name,) + identifier
+    assert table.name() == identifier
     assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
-    test_catalog.rename_table(table.identifier, new_identifier)
+    test_catalog.rename_table(table.name(), new_identifier)
     new_table = test_catalog.load_table(new_identifier)
-    assert new_table.identifier == (catalog_name,) + new_identifier
+    assert new_table.name() == new_identifier
     # the metadata_location should not change
     assert new_table.metadata_location == table.metadata_location
     # old table should be dropped
     with pytest.raises(NoSuchTableError):
         test_catalog.load_table(identifier)
     with pytest.raises(NoSuchTableError):
-        test_catalog.load_table(table.identifier)
+        test_catalog.load_table(table.name())


 @mock_aws
@@ -449,6 +446,7 @@ def test_list_tables(
     test_catalog.create_namespace(namespace=database_name)
     non_iceberg_table_name = "non_iceberg_table"
+    non_table_type_table_name = "non_table_type_table"
     glue_client = boto3.client("glue", endpoint_url=moto_endpoint_url)
     glue_client.create_table(
         DatabaseName=database_name,
@@ -458,12 +456,21 @@ def test_list_tables(
             "Parameters": {"table_type": "noniceberg"},
         },
     )
+    glue_client.create_table(
+        DatabaseName=database_name,
+        TableInput={
+            "Name": non_table_type_table_name,
+            "TableType": "OTHER_TABLE_TYPE",
+            "Parameters": {},
+        },
+    )
     for table_name in table_list:
         test_catalog.create_table((database_name, table_name), table_schema_nested)
     loaded_table_list = test_catalog.list_tables(database_name)

     assert (database_name, non_iceberg_table_name) not in loaded_table_list
+    assert (database_name, non_table_type_table_name) not in loaded_table_list
     for table_name in table_list:
         assert (database_name, table_name) in loaded_table_list
@@ -914,7 +921,7 @@ def test_register_table_with_given_location(
     test_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}"})
     test_catalog.create_namespace(namespace=database_name, properties={"location": f"s3://{BUCKET_NAME}/{database_name}.db"})
     table = test_catalog.register_table(identifier, location)
-    assert table.identifier == (catalog_name,) + identifier
+    assert table.name() == identifier
     assert test_catalog.table_exists(identifier) is True
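The new `test_list_tables` fixtures pin down the listing filter: `GlueCatalog.list_tables` should skip Glue tables whose `Parameters` lack a `table_type` entry entirely, in addition to those carrying a non-Iceberg `table_type`. A rough sketch of that predicate; `glue_tables` stands in for the raw Glue `GetTables` response, and the helper name is illustrative rather than the catalog's internal API:

```python
from typing import Any, Dict, List, Tuple

def iceberg_tables_only(database: str, glue_tables: List[Dict[str, Any]]) -> List[Tuple[str, str]]:
    """Keep only tables whose Glue parameters mark them as Iceberg tables."""
    tables = []
    for tbl in glue_tables:
        # A missing "table_type" parameter is skipped, as is any value
        # other than "iceberg" (case-insensitive).
        table_type = tbl.get("Parameters", {}).get("table_type", "")
        if table_type.lower() == "iceberg":
            tables.append((database, tbl["Name"]))
    return tables
```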
diff --git a/tests/catalog/test_hive.py b/tests/catalog/test_hive.py
index 7756611dd7..f60cc38b15 100644
--- a/tests/catalog/test_hive.py
+++ b/tests/catalog/test_hive.py
@@ -699,7 +699,7 @@ def test_load_table(hive_table: HiveTable) -> None:
         last_sequence_number=34,
     )

-    assert table.identifier == (HIVE_CATALOG_NAME, "default", "new_tabl2e")
+    assert table.name() == ("default", "new_tabl2e")
     assert expected == table.metadata
@@ -709,7 +709,7 @@ def test_load_table_from_self_identifier(hive_table: HiveTable) -> None:
     catalog._client = MagicMock()
     catalog._client.__enter__().get_table.return_value = hive_table
     intermediate = catalog.load_table(("default", "new_tabl2e"))
-    table = catalog.load_table(intermediate.identifier)
+    table = catalog.load_table(intermediate.name())

     catalog._client.__enter__().get_table.assert_called_with(dbname="default", tbl_name="new_tabl2e")
@@ -800,7 +800,7 @@ def test_load_table_from_self_identifier(hive_table: HiveTable) -> None:
         last_sequence_number=34,
     )

-    assert table.identifier == (HIVE_CATALOG_NAME, "default", "new_tabl2e")
+    assert table.name() == ("default", "new_tabl2e")
     assert expected == table.metadata
@@ -819,7 +819,7 @@ def test_rename_table(hive_table: HiveTable) -> None:
     to_identifier = ("default", "new_tabl3e")
     table = catalog.rename_table(from_identifier, to_identifier)

-    assert table.identifier == ("hive",) + to_identifier
+    assert table.name() == to_identifier

     calls = [call(dbname="default", tbl_name="new_tabl2e"), call(dbname="default", tbl_name="new_tabl3e")]
     catalog._client.__enter__().get_table.assert_has_calls(calls)
@@ -843,9 +843,9 @@ def test_rename_table_from_self_identifier(hive_table: HiveTable) -> None:
     catalog._client.__enter__().get_table.side_effect = [hive_table, renamed_table]
     catalog._client.__enter__().alter_table.return_value = None
     to_identifier = ("default", "new_tabl3e")
-    table = catalog.rename_table(from_table.identifier, to_identifier)
+    table = catalog.rename_table(from_table.name(), to_identifier)

-    assert table.identifier == ("hive",) + to_identifier
+    assert table.name() == to_identifier

     calls = [call(dbname="default", tbl_name="new_tabl2e"), call(dbname="default", tbl_name="new_tabl3e")]
     catalog._client.__enter__().get_table.assert_has_calls(calls)
@@ -919,16 +919,20 @@ def test_list_tables(hive_table: HiveTable) -> None:
     tbl3.tableName = "table3"
     tbl3.dbName = "database"
     tbl3.parameters["table_type"] = "non_iceberg"
+    tbl4 = deepcopy(hive_table)
+    tbl4.tableName = "table4"
+    tbl4.dbName = "database"
+    tbl4.parameters.pop("table_type")

     catalog._client = MagicMock()
-    catalog._client.__enter__().get_all_tables.return_value = ["table1", "table2", "table3"]
-    catalog._client.__enter__().get_table_objects_by_name.return_value = [tbl1, tbl2, tbl3]
+    catalog._client.__enter__().get_all_tables.return_value = ["table1", "table2", "table3", "table4"]
+    catalog._client.__enter__().get_table_objects_by_name.return_value = [tbl1, tbl2, tbl3, tbl4]

     got_tables = catalog.list_tables("database")

     assert got_tables == [("database", "table1"), ("database", "table2")]
     catalog._client.__enter__().get_all_tables.assert_called_with(db_name="database")
     catalog._client.__enter__().get_table_objects_by_name.assert_called_with(
-        dbname="database", tbl_names=["table1", "table2", "table3"]
+        dbname="database", tbl_names=["table1", "table2", "table3", "table4"]
     )
@@ -962,7 +966,7 @@ def test_drop_table_from_self_identifier(hive_table: HiveTable) -> None:
     table = catalog.load_table(("default", "new_tabl2e"))

     catalog._client.__enter__().get_all_databases.return_value = ["namespace1", "namespace2"]
-    catalog.drop_table(table.identifier)
+    catalog.drop_table(table.name())

     catalog._client.__enter__().drop_table.assert_called_with(dbname="default", name="new_tabl2e", deleteData=False)
diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py
index f8662c1bf4..5d633ed006 100644
--- a/tests/catalog/test_rest.py
+++ b/tests/catalog/test_rest.py
@@ -763,7 +763,7 @@ def test_load_table_from_self_identifier_200(
     )
     catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN)
     table = catalog.load_table(("pdames", "table"))
-    actual = catalog.load_table(table.identifier)
+    actual = catalog.load_table(table.name())
     expected = Table(
         identifier=("pdames", "table"),
         metadata_location=example_table_metadata_with_snapshot_v1_rest_json["metadata-location"],
@@ -1111,7 +1111,7 @@ def test_register_table_200(
     )
     assert actual.metadata.model_dump() == expected.metadata.model_dump()
     assert actual.metadata_location == expected.metadata_location
-    assert actual.identifier == expected.identifier
+    assert actual.name() == expected.name()


 def test_register_table_409(rest_mock: Mocker, table_schema_simple: Schema) -> None:
@@ -1174,7 +1174,7 @@ def test_delete_table_from_self_identifier_204(
         status_code=204,
         request_headers=TEST_HEADERS,
     )
-    catalog.drop_table(table.identifier)
+    catalog.drop_table(table.name())


 def test_rename_table_200(rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any]) -> None:
@@ -1236,7 +1236,7 @@ def test_rename_table_from_self_identifier_200(
         status_code=200,
         request_headers=TEST_HEADERS,
     )
-    actual = catalog.rename_table(table.identifier, to_identifier)
+    actual = catalog.rename_table(table.name(), to_identifier)
     expected = Table(
         identifier=("pdames", "destination"),
         metadata_location=example_table_metadata_with_snapshot_v1_rest_json["metadata-location"],
diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py
index d3815fec04..fcefc597d2 100644
--- a/tests/catalog/test_sql.py
+++ b/tests/catalog/test_sql.py
@@ -493,7 +493,7 @@ def test_create_table_with_default_warehouse_location(
     catalog.create_namespace(namespace)
     catalog.create_table(table_identifier, table_schema_nested)
     table = catalog.load_table(table_identifier)
-    assert table.identifier == (catalog.name,) + table_identifier_nocatalog
+    assert table.name() == table_identifier_nocatalog
     assert table.metadata_location.startswith(f"file://{warehouse}")
     assert os.path.exists(table.metadata_location[len("file://") :])
     catalog.drop_table(table_identifier)
@@ -524,7 +524,7 @@ def test_create_table_with_given_location_removes_trailing_slash(
     catalog.create_namespace(namespace)
     catalog.create_table(table_identifier, table_schema_nested, location=f"{location}/")
     table = catalog.load_table(table_identifier)
-    assert table.identifier == (catalog.name,) + table_identifier_nocatalog
+    assert table.name() == table_identifier_nocatalog
     assert table.metadata_location.startswith(f"file://{warehouse}")
     assert os.path.exists(table.metadata_location[len("file://") :])
     assert table.location() == location
@@ -578,7 +578,7 @@ def test_create_table_if_not_exists_duplicated_table(
     catalog.create_namespace(namespace)
     table1 = catalog.create_table(table_identifier, table_schema_nested)
     table2 = catalog.create_table_if_not_exists(table_identifier, table_schema_nested)
-    assert table1.identifier == table2.identifier
+    assert table1.name() == table2.name()


 @pytest.mark.parametrize(
@@ -626,7 +626,7 @@ def test_register_table(catalog: SqlCatalog, table_identifier: Identifier, metad
     namespace = Catalog.namespace_from(table_identifier_nocatalog)
     catalog.create_namespace(namespace)
     table = catalog.register_table(table_identifier, metadata_location)
-    assert table.identifier == (catalog.name,) + table_identifier_nocatalog
+    assert table.name() == table_identifier_nocatalog
     assert table.metadata_location == metadata_location
     assert os.path.exists(metadata_location)
     catalog.drop_table(table_identifier)
@@ -702,7 +702,7 @@ def test_load_table(catalog: SqlCatalog, table_schema_nested: Schema, table_iden
     catalog.create_namespace(namespace)
     table = catalog.create_table(table_identifier, table_schema_nested)
     loaded_table = catalog.load_table(table_identifier)
-    assert table.identifier == loaded_table.identifier
+    assert table.name() == loaded_table.name()
     assert table.metadata_location == loaded_table.metadata_location
     assert table.metadata == loaded_table.metadata
@@ -728,9 +728,9 @@ def test_load_table_from_self_identifier(catalog: SqlCatalog, table_schema_neste
     catalog.create_namespace(namespace)
     table = catalog.create_table(table_identifier, table_schema_nested)
     intermediate = catalog.load_table(table_identifier)
-    assert intermediate.identifier == (catalog.name,) + table_identifier_nocatalog
-    loaded_table = catalog.load_table(intermediate.identifier)
-    assert table.identifier == loaded_table.identifier
+    assert intermediate.name() == table_identifier_nocatalog
+    loaded_table = catalog.load_table(intermediate.name())
+    assert table.name() == loaded_table.name()
     assert table.metadata_location == loaded_table.metadata_location
     assert table.metadata == loaded_table.metadata
@@ -756,7 +756,7 @@ def test_drop_table(catalog: SqlCatalog, table_schema_nested: Schema, table_iden
     namespace = Catalog.namespace_from(table_identifier_nocatalog)
     catalog.create_namespace(namespace)
     table = catalog.create_table(table_identifier, table_schema_nested)
-    assert table.identifier == (catalog.name,) + table_identifier_nocatalog
+    assert table.name() == table_identifier_nocatalog
     catalog.drop_table(table_identifier)
     with pytest.raises(NoSuchTableError):
         catalog.load_table(table_identifier)
@@ -783,10 +783,10 @@ def test_drop_table_from_self_identifier(catalog: SqlCatalog, table_schema_neste
     namespace = Catalog.namespace_from(table_identifier_nocatalog)
     catalog.create_namespace(namespace)
     table = catalog.create_table(table_identifier, table_schema_nested)
-    assert table.identifier == (catalog.name,) + table_identifier_nocatalog
-    catalog.drop_table(table.identifier)
+    assert table.name() == table_identifier_nocatalog
+    catalog.drop_table(table.name())
     with pytest.raises(NoSuchTableError):
-        catalog.load_table(table.identifier)
+        catalog.load_table(table.name())
     with pytest.raises(NoSuchTableError):
         catalog.load_table(table_identifier)
@@ -846,10 +846,10 @@ def test_rename_table(
     catalog.create_namespace(from_namespace)
     catalog.create_namespace(to_namespace)
     table = catalog.create_table(from_table_identifier, table_schema_nested)
-    assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog
+    assert table.name() == from_table_identifier_nocatalog
     catalog.rename_table(from_table_identifier, to_table_identifier)
     new_table = catalog.load_table(to_table_identifier)
-    assert new_table.identifier == (catalog.name,) + to_table_identifier_nocatalog
+    assert new_table.name() == to_table_identifier_nocatalog
     assert new_table.metadata_location == table.metadata_location
     with pytest.raises(NoSuchTableError):
         catalog.load_table(from_table_identifier)
@@ -889,13 +889,13 @@ def test_rename_table_from_self_identifier(
     catalog.create_namespace(from_namespace)
     catalog.create_namespace(to_namespace)
     table = catalog.create_table(from_table_identifier, table_schema_nested)
-    assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog
-    catalog.rename_table(table.identifier, to_table_identifier)
+    assert table.name() == from_table_identifier_nocatalog
+    catalog.rename_table(table.name(), to_table_identifier)
     new_table = catalog.load_table(to_table_identifier)
-    assert new_table.identifier == (catalog.name,) + to_table_identifier_nocatalog
+    assert new_table.name() == to_table_identifier_nocatalog
     assert new_table.metadata_location == table.metadata_location
     with pytest.raises(NoSuchTableError):
-        catalog.load_table(table.identifier)
+        catalog.load_table(table.name())
     with pytest.raises(NoSuchTableError):
         catalog.load_table(from_table_identifier)
@@ -934,9 +934,9 @@ def test_rename_table_to_existing_one(
     catalog.create_namespace(from_namespace)
     catalog.create_namespace(to_namespace)
     table = catalog.create_table(from_table_identifier, table_schema_nested)
-    assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog
+    assert table.name() == from_table_identifier_nocatalog
     new_table = catalog.create_table(to_table_identifier, table_schema_nested)
-    assert new_table.identifier == (catalog.name,) + to_table_identifier_nocatalog
+    assert new_table.name() == to_table_identifier_nocatalog
     with pytest.raises(TableAlreadyExistsError):
         catalog.rename_table(from_table_identifier, to_table_identifier)
@@ -1004,7 +1004,7 @@ def test_rename_table_to_missing_namespace(
     from_namespace = Catalog.namespace_from(from_table_identifier_nocatalog)
     catalog.create_namespace(from_namespace)
     table = catalog.create_table(from_table_identifier, table_schema_nested)
-    assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog
+    assert table.name() == from_table_identifier_nocatalog
     with pytest.raises(NoSuchNamespaceError):
         catalog.rename_table(from_table_identifier, to_table_identifier)
diff --git a/tests/expressions/test_parser.py b/tests/expressions/test_parser.py
index 6096b10fd4..085150edec 100644
--- a/tests/expressions/test_parser.py
+++ b/tests/expressions/test_parser.py
@@ -53,6 +53,10 @@ def test_quoted_column() -> None:
     assert EqualTo("foo", True) == parser.parse('"foo" = TRUE')


+def test_leading_underscore() -> None:
+    assert EqualTo("_foo", True) == parser.parse("_foo = true")
+
+
 def test_equals_true() -> None:
     assert EqualTo("foo", True) == parser.parse("foo = true")
     assert EqualTo("foo", True) == parser.parse("foo == TRUE")
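The new `test_leading_underscore` case covers a small grammar fix in the row-filter parser: unquoted column references that begin with an underscore now parse. Usage mirrors the test itself:

```python
from pyiceberg.expressions import EqualTo, parser

# Previously a name like "_foo" had to be quoted ('"_foo" = TRUE') to parse.
assert parser.parse("_foo = true") == EqualTo("_foo", True)
```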
diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py
index 3cf17c0e8c..006e1f3af1 100644
--- a/tests/integration/test_reads.py
+++ b/tests/integration/test_reads.py
@@ -673,7 +673,7 @@ def test_hive_locking(session_catalog_hive: HiveCatalog) -> None:
     database_name: str
     table_name: str
-    _, database_name, table_name = table.identifier
+    database_name, table_name = table.name()

     hive_client: _HiveClient = _HiveClient(session_catalog_hive.properties["uri"])
     blocking_lock_request: LockRequest = session_catalog_hive._create_lock_request(database_name, table_name)
@@ -694,7 +694,7 @@ def test_hive_locking_with_retry(session_catalog_hive: HiveCatalog) -> None:
     table = create_table(session_catalog_hive)
     database_name: str
     table_name: str
-    _, database_name, table_name = table.identifier
+    database_name, table_name = table.name()
     session_catalog_hive._lock_check_min_wait_time = 0.1
     session_catalog_hive._lock_check_max_wait_time = 0.5
     session_catalog_hive._lock_check_retries = 5
diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py
index 41f1432dbf..788891711e 100644
--- a/tests/io/test_pyarrow_stats.py
+++ b/tests/io/test_pyarrow_stats.py
@@ -81,7 +81,9 @@ class TestStruct:
     y: Optional[float]


-def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+def construct_test_table(
+    write_statistics: Union[bool, List[str]] = True,
+) -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
     table_metadata = {
         "format-version": 2,
         "location": "s3://bucket/test/location",
@@ -169,7 +171,9 @@ def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, Tabl
     metadata_collector: List[Any] = []

     with pa.BufferOutputStream() as f:
-        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer:
+        with pq.ParquetWriter(
+            f, table.schema, metadata_collector=metadata_collector, write_statistics=write_statistics
+        ) as writer:
             writer.write_table(table)

     return metadata_collector[0], table_metadata
@@ -681,6 +685,41 @@ def test_stats_types(table_schema_nested: Schema) -> None:
     ]


+def test_read_missing_statistics() -> None:
+    # write statistics only for the "strings" column
+    metadata, table_metadata = construct_test_table(write_statistics=["strings"])
+
+    # expect only the "strings" column to have statistics in metadata
+    # and all other columns to have no statistics
+    for r in range(metadata.num_row_groups):
+        for pos in range(metadata.num_columns):
+            if metadata.row_group(r).column(pos).path_in_schema == "strings":
+                assert metadata.row_group(r).column(pos).is_stats_set is True
+                assert metadata.row_group(r).column(pos).statistics is not None
+            else:
+                assert metadata.row_group(r).column(pos).is_stats_set is False
+                assert metadata.row_group(r).column(pos).statistics is None
+
+    schema = get_current_schema(table_metadata)
+    statistics = data_file_statistics_from_parquet_metadata(
+        parquet_metadata=metadata,
+        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
+        parquet_column_mapping=parquet_path_to_id_mapping(schema),
+    )
+
+    datafile = DataFile(**statistics.to_serialized_dict())
+
+    # expect only the "strings" column values to be reflected in the
+    # upper_bound, lower_bound and null_value_counts props of datafile
+    string_col_idx = 1
+    assert len(datafile.lower_bounds) == 1
+    assert datafile.lower_bounds[string_col_idx].decode() == "aaaaaaaaaaaaaaaa"
+    assert len(datafile.upper_bounds) == 1
+    assert datafile.upper_bounds[string_col_idx].decode() == "zzzzzzzzzzzzzzz{"
+    assert len(datafile.null_value_counts) == 1
+    assert datafile.null_value_counts[string_col_idx] == 1
+
+
 # This is commented out for now because write_to_dataset drops the partition
 # columns making it harder to calculate the mapping from the column index to
 # datatype id
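`construct_test_table` now forwards a `write_statistics` argument to PyArrow's `pq.ParquetWriter`, which is what lets `test_read_missing_statistics` produce a file with statistics on a single column. The option accepts either a bool or a list of column names; a standalone sketch:

```python
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"strings": ["a", "b"], "ints": [1, 2]})

with pa.BufferOutputStream() as sink:
    # Collect column statistics only for "strings"; the "ints" column
    # chunk metadata will report is_stats_set == False.
    with pq.ParquetWriter(sink, table.schema, write_statistics=["strings"]) as writer:
        writer.write_table(table)
```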
diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py
index bb60ac0a21..97c88a99ee 100644
--- a/tests/utils/test_manifest.py
+++ b/tests/utils/test_manifest.py
@@ -16,7 +16,7 @@
 # under the License.
 # pylint: disable=redefined-outer-name,arguments-renamed,fixme
 from tempfile import TemporaryDirectory
-from typing import Dict
+from typing import Dict, Optional
 from unittest.mock import patch

 import fastavro
@@ -526,14 +526,18 @@ def test_write_manifest(
 @pytest.mark.parametrize("format_version", [1, 2])
+@pytest.mark.parametrize("parent_snapshot_id", [19, None])
 def test_write_manifest_list(
-    generated_manifest_file_file_v1: str, generated_manifest_file_file_v2: str, format_version: TableVersion
+    generated_manifest_file_file_v1: str,
+    generated_manifest_file_file_v2: str,
+    format_version: TableVersion,
+    parent_snapshot_id: Optional[int],
 ) -> None:
     io = load_file_io()
     snapshot = Snapshot(
         snapshot_id=25,
-        parent_snapshot_id=19,
+        parent_snapshot_id=parent_snapshot_id,
         timestamp_ms=1602638573590,
         manifest_list=generated_manifest_file_file_v1 if format_version == 1 else generated_manifest_file_file_v2,
         summary=Summary(Operation.APPEND),
@@ -545,12 +549,20 @@ def test_write_manifest_list(
         path = tmp_dir + "/manifest-list.avro"
         output = io.new_output(path)
         with write_manifest_list(
-            format_version=format_version, output_file=output, snapshot_id=25, parent_snapshot_id=19, sequence_number=0
+            format_version=format_version,
+            output_file=output,
+            snapshot_id=25,
+            parent_snapshot_id=parent_snapshot_id,
+            sequence_number=0,
         ) as writer:
             writer.add_manifests(demo_manifest_list)
         new_manifest_list = list(read_manifest_list(io.new_input(path)))

-        expected_metadata = {"snapshot-id": "25", "parent-snapshot-id": "19", "format-version": str(format_version)}
+        if parent_snapshot_id:
+            expected_metadata = {"snapshot-id": "25", "parent-snapshot-id": "19", "format-version": str(format_version)}
+        else:
+            expected_metadata = {"snapshot-id": "25", "parent-snapshot-id": "null", "format-version": str(format_version)}
+
         if format_version == 2:
             expected_metadata["sequence-number"] = "0"
         _verify_metadata_with_fastavro(path, expected_metadata)
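The parametrized `test_write_manifest_list` now also exercises a snapshot with no parent: when `parent_snapshot_id` is `None`, the manifest-list Avro metadata is expected to record the literal string `"null"` rather than omit the key. A small restatement of the expected-metadata logic the test asserts against (this mirrors the test, not pyiceberg internals, and uses an explicit `is not None` check where the diff tests truthiness):

```python
from typing import Dict, Optional

def expected_manifest_list_metadata(
    snapshot_id: int, parent_snapshot_id: Optional[int], format_version: int
) -> Dict[str, str]:
    metadata = {
        "snapshot-id": str(snapshot_id),
        # A missing parent is serialized as the string "null", not dropped.
        "parent-snapshot-id": str(parent_snapshot_id) if parent_snapshot_id is not None else "null",
        "format-version": str(format_version),
    }
    if format_version == 2:
        # v2 manifest lists also record the sequence number (0 in the test).
        metadata["sequence-number"] = "0"
    return metadata

assert expected_manifest_list_metadata(25, None, 2) == {
    "snapshot-id": "25",
    "parent-snapshot-id": "null",
    "format-version": "2",
    "sequence-number": "0",
}
```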