From b5d77f11beca2e74cf38b26bd500c36d1f7a9666 Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Mon, 11 Jan 2021 16:25:17 -0800 Subject: [PATCH 1/2] API: handle NaN as min/max stats in evaluators --- .../InclusiveMetricsEvaluator.java | 44 +- .../expressions/StrictMetricsEvaluator.java | 52 ++- .../TestMetricsEvaluatorsNaNHandling.java | 405 ++++++++++++++++++ 3 files changed, 483 insertions(+), 18 deletions(-) create mode 100644 api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index e452d27287a8..d525495755bf 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -33,6 +33,7 @@ import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.BinaryUtil; +import org.apache.iceberg.util.NaNUtil; import static org.apache.iceberg.expressions.Expressions.rewriteNot; @@ -184,7 +185,7 @@ public Boolean notNaN(BoundReference ref) { public Boolean lt(BoundReference ref, Literal lit) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } @@ -192,7 +193,12 @@ public Boolean lt(BoundReference ref, Literal lit) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); int cmp = lit.comparator().compare(lower, lit.value()); - if (cmp >= 0) { + + // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, + // if the first value in a file is NaN, metrics of this file will report NaN for both upper and + // lower bound despite that the column could contain non-NaN data. + // Without this NaN check below, we may skip including a file that contains matching data. + if (cmp >= 0 && !NaNUtil.isNaN(lower)) { return ROWS_CANNOT_MATCH; } } @@ -204,7 +210,7 @@ public Boolean lt(BoundReference ref, Literal lit) { public Boolean ltEq(BoundReference ref, Literal lit) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } @@ -212,7 +218,12 @@ public Boolean ltEq(BoundReference ref, Literal lit) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); int cmp = lit.comparator().compare(lower, lit.value()); - if (cmp > 0) { + + // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, + // if the first value in a file is NaN, metrics of this file will report NaN for both upper and + // lower bound despite that the column could contain non-NaN data. + // Without this NaN check below, we may skip including a file that contains matching data. + if (cmp > 0 && !NaNUtil.isNaN(lower)) { return ROWS_CANNOT_MATCH; } } @@ -224,7 +235,7 @@ public Boolean ltEq(BoundReference ref, Literal lit) { public Boolean gt(BoundReference ref, Literal lit) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } @@ -244,7 +255,7 @@ public Boolean gt(BoundReference ref, Literal lit) { public Boolean gtEq(BoundReference ref, Literal lit) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } @@ -264,13 +275,21 @@ public Boolean gtEq(BoundReference ref, Literal lit) { public Boolean eq(BoundReference ref, Literal lit) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, + // if the first value in a file is NaN, metrics of this file will report NaN for both upper and + // lower bound despite that the column could contain non-NaN data. + // Without this NaN check below, we may skip including a file that contains matching data. + if (NaNUtil.isNaN(lower)) { + return ROWS_MIGHT_MATCH; + } + int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; @@ -300,7 +319,7 @@ public Boolean notEq(BoundReference ref, Literal lit) { public Boolean in(BoundReference ref, Set literalSet) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } @@ -313,6 +332,15 @@ public Boolean in(BoundReference ref, Set literalSet) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + + // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, + // if the first value in a file is NaN, metrics of this file will report NaN for both upper and + // lower bound despite that the column could contain non-NaN data. + // Without this NaN check below, we may skip including a file that contains matching data. + if (NaNUtil.isNaN(lower)) { + return ROWS_MIGHT_MATCH; + } + literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList()); if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match. return ROWS_CANNOT_MATCH; diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index d46a8216b1ec..461237269fbb 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -32,6 +32,7 @@ import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.StructType; +import org.apache.iceberg.util.NaNUtil; import static org.apache.iceberg.expressions.Expressions.rewriteNot; @@ -179,7 +180,7 @@ public Boolean lt(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } @@ -202,7 +203,7 @@ public Boolean ltEq(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } @@ -225,7 +226,7 @@ public Boolean gt(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } @@ -233,7 +234,12 @@ public Boolean gt(BoundReference ref, Literal lit) { T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id)); int cmp = lit.comparator().compare(lower, lit.value()); - if (cmp > 0) { + + // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, + // if the first value in a file is NaN, metrics of this file will report NaN for both upper and + // lower bound despite that the column could contain non-NaN data. + // Without this NaN check below, we may include a file that contains rows that don't match. + if (cmp > 0 && !NaNUtil.isNaN(lower)) { return ROWS_MUST_MATCH; } } @@ -248,7 +254,7 @@ public Boolean gtEq(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } @@ -256,7 +262,12 @@ public Boolean gtEq(BoundReference ref, Literal lit) { T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id)); int cmp = lit.comparator().compare(lower, lit.value()); - if (cmp >= 0) { + + // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, + // if the first value in a file is NaN, metrics of this file will report NaN for both upper and + // lower bound despite that the column could contain non-NaN data. + // Without this NaN check below, we may include a file that contains rows that don't match. + if (cmp >= 0 && !NaNUtil.isNaN(lower)) { return ROWS_MUST_MATCH; } } @@ -271,7 +282,7 @@ public Boolean eq(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } @@ -304,13 +315,21 @@ public Boolean notEq(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_MUST_MATCH; } if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id)); + // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, + // if the first value in a file is NaN, metrics of this file will report NaN for both upper and + // lower bound despite that the column could contain non-NaN data. + // Thus we don't have visibility into the stats when lower bound is NaN. + if (NaNUtil.isNaN(lower)) { + return ROWS_MIGHT_NOT_MATCH; + } + int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_MUST_MATCH; @@ -335,7 +354,7 @@ public Boolean in(BoundReference ref, Set literalSet) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } @@ -371,7 +390,7 @@ public Boolean notIn(BoundReference ref, Set literalSet) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_MUST_MATCH; } @@ -380,6 +399,14 @@ public Boolean notIn(BoundReference ref, Set literalSet) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id)); + // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, + // if the first value in a file is NaN, metrics of this file will report NaN for both upper and + // lower bound despite that the column could contain non-NaN data. + // Thus we don't have visibility into the stats when lower bound is NaN. + if (NaNUtil.isNaN(lower)) { + return ROWS_MIGHT_NOT_MATCH; + } + literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList()); if (literals.isEmpty()) { // if all values are less than lower bound, rows must match (notIn). return ROWS_MUST_MATCH; @@ -406,6 +433,11 @@ private boolean canContainNulls(Integer id) { return nullCounts == null || (nullCounts.containsKey(id) && nullCounts.get(id) > 0); } + private boolean canContainNaNs(Integer id) { + // nan counts might be null for early version writers when nan counters are not populated. + return nanCounts != null && nanCounts.containsKey(id) && nanCounts.get(id) > 0; + } + private boolean containsNullsOnly(Integer id) { return valueCounts != null && valueCounts.containsKey(id) && nullCounts != null && nullCounts.containsKey(id) && diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java new file mode 100644 index 000000000000..bc8747b30ece --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java @@ -0,0 +1,405 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.expressions; + +import java.nio.ByteBuffer; +import java.util.Set; +import java.util.function.BiFunction; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Test; + +import static org.apache.iceberg.types.Conversions.toByteBuffer; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + +/** + * This test class ensures that metrics evaluators could handle NaN as upper/lower bounds correctly. + */ +public class TestMetricsEvaluatorsNaNHandling { + private static final Schema SCHEMA = new Schema( + required(1, "all_nan", Types.DoubleType.get()), + required(2, "max_nan", Types.DoubleType.get()), + optional(3, "min_max_nan", Types.FloatType.get()), + required(4, "all_nan_null_bounds", Types.DoubleType.get()), + optional(5, "some_nan_correct_bounds", Types.FloatType.get()) + ); + + private static final DataFile FILE = new TestHelpers.TestDataFile("file.avro", TestHelpers.Row.of(), 50, + // any value counts, including nulls + ImmutableMap.builder() + .put(1, 10L) + .put(2, 10L) + .put(3, 10L) + .put(4, 10L) + .put(5, 10L) + .build(), + // null value counts + ImmutableMap.builder() + .put(1, 0L) + .put(2, 0L) + .put(3, 0L) + .put(4, 0L) + .put(5, 0L) + .build(), + // nan value counts + ImmutableMap.builder() + .put(1, 10L) + .put(4, 10L) + .put(5, 5L) + .build(), + // lower bounds + ImmutableMap.builder() + .put(1, toByteBuffer(Types.DoubleType.get(), Double.NaN)) + .put(2, toByteBuffer(Types.DoubleType.get(), 7D)) + .put(3, toByteBuffer(Types.FloatType.get(), Float.NaN)) + .put(5, toByteBuffer(Types.FloatType.get(), 7F)) + .build(), + // upper bounds + ImmutableMap.builder() + .put(1, toByteBuffer(Types.DoubleType.get(), Double.NaN)) + .put(2, toByteBuffer(Types.DoubleType.get(), Double.NaN)) + .put(3, toByteBuffer(Types.FloatType.get(), Float.NaN)) + .put(5, toByteBuffer(Types.FloatType.get(), 22F)) + .build()); + + private static final Set> LESS_THAN_EXPRESSIONS = + ImmutableSet.of(Expressions::lessThan, Expressions::lessThanOrEqual); + + private static final Set> GREATER_THAN_EXPRESSIONS = + ImmutableSet.of(Expressions::greaterThan, Expressions::greaterThanOrEqual); + + @Test + public void testInclusiveMetricsEvaluatorLessThanAndLessThanOrEqual() { + for (BiFunction func : LESS_THAN_EXPRESSIONS) { + boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE); + Assert.assertTrue("Should match: 10 is larger than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 1F)).eval(FILE); + Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE); + Assert.assertTrue("Should match: 10 larger than lower bound", shouldRead); + } + } + + @Test + public void testInclusiveMetricsEvaluatorGreaterThanAndGreaterThanOrEqual() { + for (BiFunction func : GREATER_THAN_EXPRESSIONS) { + boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 1D)).eval(FILE); + Assert.assertTrue("Should match: upper bound is larger than 1", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE); + Assert.assertTrue("Should match: upper bound is larger than 10", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 1F)).eval(FILE); + Assert.assertTrue("Should match: 1 is smaller than upper bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE); + Assert.assertTrue("Should match: 10 is smaller than upper bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertFalse("Should not match: 30 is greater than upper bound", shouldRead); + } + } + + @Test + public void testInclusiveMetricsEvaluatorEquals() { + boolean shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.equal("all_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("max_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("max_nan", 10D)).eval(FILE); + Assert.assertTrue("Should match: 10 is within bounds", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("min_max_nan", 1F)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.equal("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 1F)).eval(FILE); + Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 10F)).eval(FILE); + Assert.assertTrue("Should match: 10 is within bounds", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertFalse("Should not match: 30 is greater than upper bound", shouldRead); + } + + @Test + public void testInclusiveMetricsEvaluatorNotEquals() { + boolean shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("all_nan", 1D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("max_nan", 1D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("max_nan", 10D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("min_max_nan", 1F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + } + + @Test + public void testInclusiveMetricsEvaluatorIn() { + boolean shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("all_nan", 1D, 10D, 30D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("max_nan", 1D, 10D, 30D)).eval(FILE); + Assert.assertTrue("Should match: 10 and 30 are greater than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("min_max_nan", 1F, 10F, 30F)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("all_nan_null_bounds", 1D, 10D, 30D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("some_nan_correct_bounds", 1F, 10F, 30F)).eval(FILE); + Assert.assertTrue("Should match: 10 within bounds", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("some_nan_correct_bounds", 1F)).eval(FILE); + Assert.assertFalse("Should not match: 1 not within bounds", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("some_nan_correct_bounds", 30F)).eval(FILE); + Assert.assertFalse("Should not match: 30 not within bounds", shouldRead); + } + + @Test + public void testInclusiveMetricsEvaluatorNotIn() { + boolean shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("all_nan", 1D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("max_nan", 1D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("max_nan", 10D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("min_max_nan", 1F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + } + + @Test + public void testStrictMetricsEvaluatorLessThanAndLessThanOrEqual() { + for (BiFunction func : LESS_THAN_EXPRESSIONS) { + boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE); + Assert.assertFalse("Should not match: 10 is less than upper bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); + Assert.assertFalse("Should not match: no visibility", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE); + Assert.assertFalse("Should not match: nan value exists", shouldRead); + } + } + + @Test + public void testStrictMetricsEvaluatorGreaterThanAndGreaterThanOrEqual() { + for (BiFunction func : GREATER_THAN_EXPRESSIONS) { + boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("max_nan", 1D)).eval(FILE); + Assert.assertTrue("Should match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE); + Assert.assertFalse("Should not match: 10 is larger than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); + Assert.assertFalse("Should not match: no visibility", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertFalse("Should not match: nan value exists", shouldRead); + } + } + + @Test + public void testStrictMetricsEvaluatorNotEquals() { + boolean shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notEqual("all_nan", 1D)).eval(FILE); + Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("max_nan", 1D)).eval(FILE); + Assert.assertTrue("Should match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("max_nan", 10D)).eval(FILE); + Assert.assertFalse("Should not match: 10 is within bounds", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("min_max_nan", 1F)).eval(FILE); + Assert.assertFalse("Should not match: no visibility", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notEqual("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 1F)).eval(FILE); + Assert.assertTrue("Should match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 10F)).eval(FILE); + Assert.assertFalse("Should not match: 10 is within bounds", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertTrue("Should match: 30 is greater than upper bound", shouldRead); + } + + @Test + public void testStrictMetricsEvaluatorEquals() { + boolean shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("all_nan", 1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("max_nan", 1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("max_nan", 10D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("min_max_nan", 1F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertFalse("Should not match: bounds not equal to given value", shouldRead); + } + + @Test + public void testStrictMetricsEvaluatorNotIn() { + boolean shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("all_nan", 1D, 10D, 30D)).eval(FILE); + Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("max_nan", 1D, 10D, 30D)).eval(FILE); + Assert.assertFalse("Should not match: 10 and 30 are greater than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("max_nan", 1D)).eval(FILE); + Assert.assertTrue("Should match: 1 is less than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("min_max_nan", 1F, 10F, 30F)).eval(FILE); + Assert.assertFalse("Should not match: no visibility", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("all_nan_null_bounds", 1D, 10D, 30D)).eval(FILE); + Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1F, 10F, 30F)).eval(FILE); + Assert.assertFalse("Should not match: 10 within bounds", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1D)).eval(FILE); + Assert.assertTrue("Should match: 1 not within bounds", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 30D)).eval(FILE); + Assert.assertTrue("Should match: 30 not within bounds", shouldRead); + } + + @Test + public void testStrictMetricsEvaluatorIn() { + boolean shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.in("all_nan", 1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("max_nan", 1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("max_nan", 10D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("min_max_nan", 1F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertFalse("Should not match: bounds not equal to given value", shouldRead); + } +} From f3e6f4e1945915c4d95043da6e7442bdf3b8c4f4 Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Tue, 19 Jan 2021 16:32:57 -0800 Subject: [PATCH 2/2] update comments and tests --- .../InclusiveMetricsEvaluator.java | 39 +++++++++---------- .../expressions/StrictMetricsEvaluator.java | 39 +++++++++---------- .../TestMetricsEvaluatorsNaNHandling.java | 8 +--- 3 files changed, 40 insertions(+), 46 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index d525495755bf..a6c3c65e9830 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -45,6 +45,11 @@ * Files are passed to {@link #eval(ContentFile)}, which returns true if the file may contain matching * rows and false if the file cannot contain matching rows. Files may be skipped if and only if the * return value of {@code eval} is false. + *

+ * Due to the comparison implementation of ORC stats, for float/double columns in ORC files, if the first + * value in a file is NaN, metrics of this file will report NaN for both upper and lower bound despite + * that the column could contain non-NaN data. Thus in some scenarios explicitly checks for NaN is necessary + * in order to not skip files that may contain matching data. */ public class InclusiveMetricsEvaluator { private static final int IN_PREDICATE_LIMIT = 200; @@ -192,13 +197,13 @@ public Boolean lt(BoundReference ref, Literal lit) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); - int cmp = lit.comparator().compare(lower, lit.value()); + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. + return ROWS_MIGHT_MATCH; + } - // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, - // if the first value in a file is NaN, metrics of this file will report NaN for both upper and - // lower bound despite that the column could contain non-NaN data. - // Without this NaN check below, we may skip including a file that contains matching data. - if (cmp >= 0 && !NaNUtil.isNaN(lower)) { + int cmp = lit.comparator().compare(lower, lit.value()); + if (cmp >= 0) { return ROWS_CANNOT_MATCH; } } @@ -217,13 +222,13 @@ public Boolean ltEq(BoundReference ref, Literal lit) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); - int cmp = lit.comparator().compare(lower, lit.value()); + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. + return ROWS_MIGHT_MATCH; + } - // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, - // if the first value in a file is NaN, metrics of this file will report NaN for both upper and - // lower bound despite that the column could contain non-NaN data. - // Without this NaN check below, we may skip including a file that contains matching data. - if (cmp > 0 && !NaNUtil.isNaN(lower)) { + int cmp = lit.comparator().compare(lower, lit.value()); + if (cmp > 0) { return ROWS_CANNOT_MATCH; } } @@ -282,11 +287,8 @@ public Boolean eq(BoundReference ref, Literal lit) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); - // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, - // if the first value in a file is NaN, metrics of this file will report NaN for both upper and - // lower bound despite that the column could contain non-NaN data. - // Without this NaN check below, we may skip including a file that contains matching data. if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH; } @@ -333,11 +335,8 @@ public Boolean in(BoundReference ref, Set literalSet) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); - // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, - // if the first value in a file is NaN, metrics of this file will report NaN for both upper and - // lower bound despite that the column could contain non-NaN data. - // Without this NaN check below, we may skip including a file that contains matching data. if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH; } diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index 461237269fbb..6ea293a0b562 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -45,6 +45,11 @@ *

* Files are passed to {@link #eval(ContentFile)}, which returns true if all rows in the file must * contain matching rows and false if the file may contain rows that do not match. + *

+ * Due to the comparison implementation of ORC stats, for float/double columns in ORC files, if the first + * value in a file is NaN, metrics of this file will report NaN for both upper and lower bound despite + * that the column could contain non-NaN data. Thus in some scenarios explicitly checks for NaN is necessary + * in order to not include files that may contain rows that don't match. */ public class StrictMetricsEvaluator { private final Schema schema; @@ -233,13 +238,13 @@ public Boolean gt(BoundReference ref, Literal lit) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id)); - int cmp = lit.comparator().compare(lower, lit.value()); + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more. + return ROWS_MIGHT_NOT_MATCH; + } - // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, - // if the first value in a file is NaN, metrics of this file will report NaN for both upper and - // lower bound despite that the column could contain non-NaN data. - // Without this NaN check below, we may include a file that contains rows that don't match. - if (cmp > 0 && !NaNUtil.isNaN(lower)) { + int cmp = lit.comparator().compare(lower, lit.value()); + if (cmp > 0) { return ROWS_MUST_MATCH; } } @@ -261,13 +266,13 @@ public Boolean gtEq(BoundReference ref, Literal lit) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id)); - int cmp = lit.comparator().compare(lower, lit.value()); + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more. + return ROWS_MIGHT_NOT_MATCH; + } - // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, - // if the first value in a file is NaN, metrics of this file will report NaN for both upper and - // lower bound despite that the column could contain non-NaN data. - // Without this NaN check below, we may include a file that contains rows that don't match. - if (cmp >= 0 && !NaNUtil.isNaN(lower)) { + int cmp = lit.comparator().compare(lower, lit.value()); + if (cmp >= 0) { return ROWS_MUST_MATCH; } } @@ -322,11 +327,8 @@ public Boolean notEq(BoundReference ref, Literal lit) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id)); - // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, - // if the first value in a file is NaN, metrics of this file will report NaN for both upper and - // lower bound despite that the column could contain non-NaN data. - // Thus we don't have visibility into the stats when lower bound is NaN. if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more. return ROWS_MIGHT_NOT_MATCH; } @@ -399,11 +401,8 @@ public Boolean notIn(BoundReference ref, Set literalSet) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id)); - // Due to the comparison implementation of ORC stats, for float/double columns in ORC files, - // if the first value in a file is NaN, metrics of this file will report NaN for both upper and - // lower bound despite that the column could contain non-NaN data. - // Thus we don't have visibility into the stats when lower bound is NaN. if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more. return ROWS_MIGHT_NOT_MATCH; } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java index bc8747b30ece..415f36126a3b 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java @@ -222,12 +222,8 @@ public void testInclusiveMetricsEvaluatorIn() { Assert.assertTrue("Should match: 10 within bounds", shouldRead); shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.in("some_nan_correct_bounds", 1F)).eval(FILE); + SCHEMA, Expressions.in("some_nan_correct_bounds", 1F, 30F)).eval(FILE); Assert.assertFalse("Should not match: 1 not within bounds", shouldRead); - - shouldRead = new InclusiveMetricsEvaluator( - SCHEMA, Expressions.in("some_nan_correct_bounds", 30F)).eval(FILE); - Assert.assertFalse("Should not match: 30 not within bounds", shouldRead); } @Test @@ -266,7 +262,7 @@ public void testStrictMetricsEvaluatorLessThanAndLessThanOrEqual() { shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); - shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30F)).eval(FILE); Assert.assertFalse("Should not match: nan value exists", shouldRead); } }