From 61cdd08c59f3f1d3119b5f907eb09dbbcf80b8c2 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar
Date: Mon, 23 Dec 2024 18:25:20 -0500
Subject: [PATCH] Add `make_name_compatible` suggestion so test passes

---
 tests/integration/test_partitioning_key.py | 62 +++++++++++++++++-----
 1 file changed, 49 insertions(+), 13 deletions(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index a43cf95c4..57a0b5243 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -18,7 +18,7 @@
 import uuid
 from datetime import date, datetime, timedelta, timezone
 from decimal import Decimal
-from typing import Any, List
+from typing import Any, Callable, List, Optional
 
 import pytest
 from pyspark.sql import SparkSession
@@ -78,7 +78,7 @@
 
 
 @pytest.mark.parametrize(
-    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification",
+    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name",
     [
         # # Identity Transform
         (
@@ -99,6 +99,7 @@
             VALUES
             (false, 'Boolean field set to false');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")],
@@ -118,6 +119,7 @@
             VALUES
             ('sample_string', 'Another string value')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")],
@@ -137,6 +139,7 @@
             VALUES
             (42, 'Associated string value for int 42')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")],
@@ -156,6 +159,7 @@
             VALUES
             (1234567890123456789, 'Associated string value for long 1234567890123456789')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")],
@@ -179,6 +183,7 @@
             # VALUES
             # (3.14, 'Associated string value for float 3.14')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")],
@@ -202,6 +207,7 @@
             # VALUES
             # (6.282, 'Associated string value for double 6.282')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -221,6 +227,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -240,6 +247,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -264,6 +272,7 @@
             # VALUES
             # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")],
@@ -288,6 +297,7 @@
             # VALUES
             # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
             # """
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")],
@@ -307,6 +317,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
@@ -326,6 +337,7 @@
             VALUES
             ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
@@ -345,6 +357,7 @@
             VALUES
             (CAST('example' AS BINARY), 'Associated string value for binary `example`')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")],
@@ -364,6 +377,7 @@
             VALUES
             (123.45, 'Associated string value for decimal 123.45')
             """,
+            None,
         ),
         # # Year Month Day Hour Transform
         # Month Transform
@@ -385,6 +399,7 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")],
@@ -404,6 +419,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")],
@@ -423,6 +439,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # Year Transform
         (
@@ -443,6 +460,7 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")],
@@ -462,6 +480,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")],
@@ -481,6 +500,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # # Day Transform
         (
@@ -501,6 +521,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")],
@@ -520,6 +541,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")],
@@ -539,6 +561,7 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
+            None,
         ),
         # Hour Transform
         (
@@ -559,6 +582,7 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")],
@@ -578,6 +602,7 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
+            None,
         ),
         # Truncate Transform
         (
@@ -598,6 +623,7 @@
             VALUES
             (12345, 'Sample data for int');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")],
@@ -617,6 +643,7 @@
             VALUES
             (4294967297, 'Sample data for long');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")],
@@ -636,6 +663,7 @@
             VALUES
             ('abcdefg', 'Another sample for string');
             """,
+            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")],
@@ -655,6 +683,7 @@
             VALUES
             (678.90, 'Associated string value for decimal 678.90')
             """,
+            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")],
@@ -674,6 +703,7 @@
             VALUES
             (binary('HELLOICEBERG'), 'Sample data for binary');
             """,
+            None,
         ),
         # Bucket Transform
         (
@@ -694,6 +724,7 @@
             VALUES
             (10, 'Integer with value 10');
             """,
+            None,
         ),
         # Test multiple field combinations could generate the Partition record and hive partition path correctly
         (
@@ -722,30 +753,27 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
             """,
+            None,
         ),
         # Test that special characters are URL-encoded
         (
-            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")],
+            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string+field")],
             ["special string"],
-            Record(**{"special#string#field": "special string"}),  # type: ignore
-            "special%23string%23field=special+string",
-            # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification
-            # (comparing expected value with Spark behavior) would fail: PyIceberg produces
-            # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].
-            # None,
-            # None,
+            Record(**{"special#string+field": "special string"}),  # type: ignore
+            "special%23string%2Bfield=special+string",
             f"""CREATE TABLE {identifier} (
-            `special#string#field` string
+            `special#string+field` string
             )
             USING iceberg
             PARTITIONED BY (
-            identity(`special#string#field`)
+            identity(`special#string+field`)
             )
             """,
             f"""INSERT INTO {identifier}
             VALUES
             ('special string')
             """,
+            lambda name: name.replace("#", "_x23").replace("+", "_x2B"),
         ),
     ],
 )
@@ -759,6 +787,7 @@ def test_partition_key(
     expected_hive_partition_path_slice: str,
     spark_create_table_sql_for_justification: str,
     spark_data_insert_sql_for_justification: str,
+    make_compatible_name: Optional[Callable[[str], str]],
 ) -> None:
     partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)]
     spec = PartitionSpec(*partition_fields)
@@ -793,5 +822,12 @@ def test_partition_key(
         spark_path_for_justification = (
             snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
         )
-        assert spark_partition_for_justification == expected_partition_record
+        # Special characters in partition value are sanitized when written to the data file's partition field
+        # Use `make_compatible_name` to match the sanitize behavior
+        sanitized_record = (
+            Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
+            if make_compatible_name
+            else expected_partition_record
+        )
+        assert spark_partition_for_justification == sanitized_record
         assert expected_hive_partition_path_slice in spark_path_for_justification
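
Note on the new `make_compatible_name` parameter: the `_x23`/`_x2B` replacements in the lambda mirror Iceberg's Avro-compatible name sanitization, which rewrites a disallowed character as `_x` followed by its hex code point (`#` is U+0023, `+` is U+002B). A minimal self-contained sketch of that mapping, for illustration only (the standalone helper below is hypothetical, not part of the patch):

    # Illustration: the sanitization behavior the test emulates.
    # Disallowed characters become "_x" plus the character's hex code
    # point: '#' (U+0023) -> "_x23", '+' (U+002B) -> "_x2B".
    def make_compatible_name(name: str) -> str:
        return name.replace("#", "_x23").replace("+", "_x2B")

    assert make_compatible_name("special#string+field") == "special_x23string_x2Bfield"

The expected hive path in the test is encoded separately from this: the field name is URL-encoded (`%23`, `%2B`) and the space in the partition value becomes `+`, giving `special%23string%2Bfield=special+string`, while the Record's field name uses the `_xNN` sanitization shown above.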