From 5382abe75b54a4ac61c3cf30c71b7901e1cd7474 Mon Sep 17 00:00:00 2001 From: Jennifer Baldwin Date: Mon, 9 Dec 2024 10:30:42 -0500 Subject: [PATCH 1/4] OTF-2326 push updates for greater than --- .../InclusiveMetricsEvaluator.java | 27 ------------------- .../expressions/ManifestEvaluator.java | 8 +++--- .../parquet/ParquetMetricsRowGroupFilter.java | 8 ------ 3 files changed, 4 insertions(+), 39 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index 105e32274219..e7b7693c1df3 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -331,10 +331,6 @@ public Boolean gt(BoundReference ref, BoundReference ref2) { return ROWS_MIGHT_MATCH; } - if (checkUpperBounds(ref, ref2, id, id2, cmp -> cmp <= 0)) { - return ROWS_CANNOT_MATCH; - } - if (checkUpperToLowerBounds(ref, ref2, id, id2, cmp -> cmp <= 0)) { return ROWS_CANNOT_MATCH; } @@ -378,10 +374,6 @@ public Boolean gtEq(BoundReference ref, BoundReference ref2) { return ROWS_MIGHT_MATCH; } - if (checkUpperBounds(ref, ref2, id, id2, cmp -> cmp < 0)) { - return ROWS_CANNOT_MATCH; - } - if (checkUpperToLowerBounds(ref, ref2, id, id2, cmp -> cmp < 0)) { return ROWS_CANNOT_MATCH; } @@ -464,25 +456,6 @@ public Boolean notEq(BoundReference ref, BoundReference ref2) { return ROWS_MIGHT_MATCH; } - private boolean checkUpperBounds( - BoundReference ref, - BoundReference ref2, - Integer id, - Integer id2, - java.util.function.Predicate compare) { - if (upperBounds != null && upperBounds.containsKey(id) && upperBounds.containsKey(id2)) { - T upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id)); - T upper2 = Conversions.fromByteBuffer(ref2.type(), upperBounds.get(id2)); - - Comparator comparator = Comparators.forType(ref.type().asPrimitiveType()); - int cmp = comparator.compare(upper, upper2); - if (compare.test(cmp)) { - return true; - } - } - return false; - } - private boolean checkLowerToUpperBounds( BoundReference ref, BoundReference ref2, diff --git a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java index d6931846476a..ab62a9fb5227 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java @@ -329,16 +329,16 @@ public Boolean gtEq(BoundReference ref, BoundReference ref2) { int pos = Accessors.toPosition(ref.accessor()); int pos2 = Accessors.toPosition(ref2.accessor()); ByteBuffer upperBound = stats.get(pos).upperBound(); - ByteBuffer upperBound2 = stats.get(pos2).upperBound(); - if (upperBound == null || upperBound2 == null) { + ByteBuffer lowerBound = stats.get(pos2).lowerBound(); + if (upperBound == null || lowerBound == null) { return ROWS_CANNOT_MATCH; // values are all null } T upper = Conversions.fromByteBuffer(ref.type(), upperBound); - T upper2 = Conversions.fromByteBuffer(ref2.type(), upperBound2); + T lower = Conversions.fromByteBuffer(ref2.type(), lowerBound); Comparator comparator = Comparators.forType(ref.type().asPrimitiveType()); - int cmp = comparator.compare(upper, upper2); + int cmp = comparator.compare(upper, lower); if (cmp < 0) { return ROWS_CANNOT_MATCH; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index cec57de04362..01c5972d7bc3 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -385,10 +385,6 @@ public Boolean gt(BoundReference ref, BoundReference ref2) { return ROWS_MIGHT_MATCH; } - if (compareUpperStats(ref, id, id2, colStats, colStats2, cmp -> cmp <= 0)) { - return ROWS_CANNOT_MATCH; - } - if (compareUpperToLowerStats(ref, id, id2, colStats, colStats2, cmp -> cmp <= 0)) { return ROWS_CANNOT_MATCH; } @@ -454,10 +450,6 @@ public Boolean gtEq(BoundReference ref, BoundReference ref2) { return ROWS_MIGHT_MATCH; } - if (compareUpperStats(ref, id, id2, colStats, colStats2, cmp -> cmp < 0)) { - return ROWS_CANNOT_MATCH; - } - if (compareUpperToLowerStats(ref, id, id2, colStats, colStats2, cmp -> cmp < 0)) { return ROWS_CANNOT_MATCH; } From 644ae9d6c4f279058feddc750f778f085a5418b3 Mon Sep 17 00:00:00 2001 From: Jennifer Baldwin Date: Wed, 11 Dec 2024 15:05:44 -0500 Subject: [PATCH 2/4] OTF-2326 fix issues with equals operator on data pruning --- .../InclusiveMetricsEvaluator.java | 28 ----------- .../parquet/ParquetMetricsRowGroupFilter.java | 49 ------------------- 2 files changed, 77 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index e7b7693c1df3..4cd118be9849 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -284,14 +284,6 @@ public Boolean ltEq(BoundReference ref, BoundReference ref2) { return ROWS_CANNOT_MATCH; } - if (ref.type().typeId() != ref2.type().typeId()) { - return ROWS_MIGHT_MATCH; - } - - if (checkLowerToUpperBounds(ref, ref2, id, id2, cmp -> cmp > 0)) { - return ROWS_CANNOT_MATCH; - } - return ROWS_MIGHT_MATCH; } @@ -370,14 +362,6 @@ public Boolean gtEq(BoundReference ref, BoundReference ref2) { return ROWS_CANNOT_MATCH; } - if (ref.type().typeId() != ref2.type().typeId()) { - return ROWS_MIGHT_MATCH; - } - - if (checkUpperToLowerBounds(ref, ref2, id, id2, cmp -> cmp < 0)) { - return ROWS_CANNOT_MATCH; - } - return ROWS_MIGHT_MATCH; } @@ -427,18 +411,6 @@ public Boolean eq(BoundReference ref, BoundReference ref2) { return ROWS_CANNOT_MATCH; } - if (ref.type().typeId() != ref2.type().typeId()) { - return ROWS_MIGHT_MATCH; - } - - if (checkLowerToUpperBounds(ref, ref2, id, id2, cmp -> cmp > 0)) { - return ROWS_CANNOT_MATCH; - } - - if (checkUpperToLowerBounds(ref, ref2, id, id2, cmp -> cmp < 0)) { - return ROWS_CANNOT_MATCH; - } - return ROWS_MIGHT_MATCH; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index 01c5972d7bc3..4abed33116dd 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -319,10 +319,6 @@ public Boolean ltEq(BoundReference ref, BoundReference ref2) { if (minMaxUndefined(colStats) || minMaxUndefined(colStats2)) { return ROWS_MIGHT_MATCH; } - - if (compareLowerToUpperStats(ref, id, id2, colStats, colStats2, cmp -> cmp > 0)) { - return ROWS_CANNOT_MATCH; - } } return ROWS_MIGHT_MATCH; @@ -449,10 +445,6 @@ public Boolean gtEq(BoundReference ref, BoundReference ref2) { if (minMaxUndefined(colStats) || minMaxUndefined(colStats2)) { return ROWS_MIGHT_MATCH; } - - if (compareUpperToLowerStats(ref, id, id2, colStats, colStats2, cmp -> cmp < 0)) { - return ROWS_CANNOT_MATCH; - } } return ROWS_MIGHT_MATCH; @@ -557,52 +549,11 @@ private Boolean compareStats( if (minMaxUndefined(colStats) || minMaxUndefined(colStats2)) { return ROWS_MIGHT_MATCH; } - - if (compareLowerStats(ref, id, id2, colStats, colStats2, cmp -> cmp > 0)) { - return ROWS_CANNOT_MATCH; - } - if (compareUpperStats(ref, id, id2, colStats, colStats2, cmp -> cmp < 0)) { - return ROWS_CANNOT_MATCH; - } } } return ROWS_MIGHT_MATCH; } - private boolean compareUpperStats( - BoundReference ref, - int id, - int id2, - Statistics colStats, - Statistics colStats2, - java.util.function.Predicate compare) { - int cmp; - - T upper = max(colStats, id); - T upper2 = max(colStats2, id2); - cmp = ref.comparator().compare(upper, upper2); - if (compare.test(cmp)) { - return true; - } - return false; - } - - private boolean compareLowerStats( - BoundReference ref, - int id, - int id2, - Statistics colStats, - Statistics colStats2, - java.util.function.Predicate compare) { - T lower = min(colStats, id); - T lower2 = min(colStats2, id2); - int cmp = ref.comparator().compare(lower, lower2); - if (compare.test(cmp)) { - return true; - } - return false; - } - private boolean compareLowerToUpperStats( BoundReference ref, int id, From 8539249d6d5097bef0919fd7dde2d7e61e26aa68 Mon Sep 17 00:00:00 2001 From: Jennifer Baldwin Date: Thu, 12 Dec 2024 06:27:42 -0500 Subject: [PATCH 3/4] OTF-2326 fix test --- .../expressions/TestInclusiveMetricsEvaluator.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index 663bc221cd56..55a425cbf357 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -714,15 +714,15 @@ public void testRefCompareIntegerEq() { new InclusiveMetricsEvaluator(SCHEMA, termPredicate(Expression.Operation.EQ, "id", "id2")) .eval(FILE_2); assertThat(shouldRead) - .as("Should not read: id range (30,40) can not be equal to id2 range (50,80)") - .isFalse(); + .as("Should not read: id range (30,40) can be equal to id2 range (50,80)") + .isTrue(); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, termPredicate(Expression.Operation.EQ, "id", "id2")) .eval(FILE_3); assertThat(shouldRead) - .as("Should not read: id range (5,25) can not be equal to id2 range (30,40)") - .isFalse(); + .as("Should not read: id range (5,25) can be equal to id2 range (30,40)") + .isTrue(); shouldRead = new InclusiveMetricsEvaluator(SCHEMA, termPredicate(Expression.Operation.EQ, "id", "id2")) From 45e04d808af824bf9023d585df3f43664c6fe97a Mon Sep 17 00:00:00 2001 From: Jennifer Baldwin Date: Thu, 12 Dec 2024 08:35:28 -0500 Subject: [PATCH 4/4] OTF-2326 fix tests --- .../expressions/TestInclusiveMetricsEvaluator.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java index 55a425cbf357..6d19484233ec 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java @@ -553,8 +553,8 @@ SCHEMA, termPredicate(Expression.Operation.LT_EQ, "id", "id2")) SCHEMA, termPredicate(Expression.Operation.LT_EQ, "id", "id2")) .eval(FILE_3); assertThat(shouldRead) - .as("Should not read: id range lower bound (30) is not below upper bound id range (25)") - .isFalse(); + .as("Should not read: id range lower bound (30) can be equal to upper bound range (25)") + .isTrue(); shouldRead = new InclusiveMetricsEvaluator( @@ -655,9 +655,8 @@ SCHEMA, termPredicate(Expression.Operation.GT_EQ, "id", "id2")) SCHEMA, termPredicate(Expression.Operation.GT_EQ, "id", "id2")) .eval(FILE_2); assertThat(shouldRead) - .as( - "Should not read: id range upper bound (40) is not greater than upper bound id2 range (80)") - .isFalse(); + .as("Should read: id range upper bound (40) can be equal to upper range (80)") + .isTrue(); shouldRead = new InclusiveMetricsEvaluator(