From 78c73d13ef6ba33b9876665d5260bda666f6d2a9 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 21 Dec 2024 12:33:35 -0800 Subject: [PATCH 1/4] update pyspark java iceberg library to 1.6.0 --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 89af22896..fb9d34a56 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2242,7 +2242,7 @@ def spark() -> "SparkSession": spark_version = ".".join(importlib.metadata.version("pyspark").split(".")[:2]) scala_version = "2.12" - iceberg_version = "1.4.3" + iceberg_version = "1.6.0" os.environ["PYSPARK_SUBMIT_ARGS"] = ( f"--packages org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version}," From 0ea945d12f04b38af93c6a0a1338dd3967d943aa Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 21 Dec 2024 13:00:15 -0800 Subject: [PATCH 2/4] fix test --- tests/integration/test_deletes.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_deletes.py b/tests/integration/test_deletes.py index affc480f0..f2417bde2 100644 --- a/tests/integration/test_deletes.py +++ b/tests/integration/test_deletes.py @@ -237,9 +237,7 @@ def test_delete_partitioned_table_positional_deletes(spark: SparkSession, sessio # Will rewrite a data file without the positional delete tbl.delete(EqualTo("number", 40)) - # One positional delete has been added, but an OVERWRITE status is set - # https://github.com/apache/iceberg/issues/10122 - assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "overwrite", "overwrite"] + assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "delete", "overwrite"] assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [10], "number": [20]} @@ -410,8 +408,6 @@ def test_overwrite_partitioned_table(spark: SparkSession, session_catalog: RestC # Will rewrite a data file without the positional delete 
tbl.overwrite(arrow_tbl, "number_partitioned == 10") - # One positional delete has been added, but an OVERWRITE status is set - # https://github.com/apache/iceberg/issues/10122 assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "delete", "append"] assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [10, 10, 20], "number": [4, 5, 3]} @@ -461,13 +457,11 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSessio # Will rewrite a data file without a positional delete tbl.delete(EqualTo("number", 201)) - # One positional delete has been added, but an OVERWRITE status is set - # https://github.com/apache/iceberg/issues/10122 snapshots = tbl.snapshots() assert len(snapshots) == 3 # Snapshots produced by Spark - assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()[0:2]] == ["append", "overwrite"] + assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()[0:2]] == ["append", "delete"] # Will rewrite one parquet file assert snapshots[2].summary == Summary( From d5998c61b44253372b26ff8be4fa070fc1b0d633 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 22 Dec 2024 10:46:41 -0800 Subject: [PATCH 3/4] add reminder --- dev/Dockerfile | 1 + tests/conftest.py | 1 + 2 files changed, 2 insertions(+) diff --git a/dev/Dockerfile b/dev/Dockerfile index d4346bf75..fe0297bd1 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -36,6 +36,7 @@ ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events WORKDIR ${SPARK_HOME} +# Remember to also update `tests/conftest`'s spark setting ENV SPARK_VERSION=3.5.3 ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 ENV ICEBERG_VERSION=1.6.0 diff --git a/tests/conftest.py b/tests/conftest.py index fb9d34a56..22329b388 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2240,6 +2240,7 @@ def spark() -> "SparkSession": from 
pyspark.sql import SparkSession + # Remember to also update `dev/Dockerfile` spark_version = ".".join(importlib.metadata.version("pyspark").split(".")[:2]) scala_version = "2.12" iceberg_version = "1.6.0" From c5b95d0377b1feb6e3b1cf8200a6e56ed7133f9d Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 22 Dec 2024 12:41:51 -0800 Subject: [PATCH 4/4] make link --- dev/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/Dockerfile b/dev/Dockerfile index fe0297bd1..1cc70beda 100644 --- a/dev/Dockerfile +++ b/dev/Dockerfile @@ -36,7 +36,7 @@ ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events WORKDIR ${SPARK_HOME} -# Remember to also update `tests/conftest`'s spark setting +# Remember to also update `tests/conftest`'s spark setting ENV SPARK_VERSION=3.5.3 ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12 ENV ICEBERG_VERSION=1.6.0