diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java index e452d27287a8..a6c3c65e9830 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java @@ -33,6 +33,7 @@ import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.BinaryUtil; +import org.apache.iceberg.util.NaNUtil; import static org.apache.iceberg.expressions.Expressions.rewriteNot; @@ -44,6 +45,11 @@ * Files are passed to {@link #eval(ContentFile)}, which returns true if the file may contain matching * rows and false if the file cannot contain matching rows. Files may be skipped if and only if the * return value of {@code eval} is false. + *

+ * Due to the comparison implementation of ORC stats, for float/double columns in ORC files, if the first + * value in a file is NaN, the metrics of this file will report NaN for both the upper and lower bounds even + * though the column could contain non-NaN data. Thus in some scenarios explicit checks for NaN are necessary + * in order to not skip files that may contain matching data. */ public class InclusiveMetricsEvaluator { private static final int IN_PREDICATE_LIMIT = 200; @@ -184,13 +190,18 @@ public Boolean notNaN(BoundReference ref) { public Boolean lt(BoundReference ref, Literal lit) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. + return ROWS_MIGHT_MATCH; + } + int cmp = lit.comparator().compare(lower, lit.value()); if (cmp >= 0) { return ROWS_CANNOT_MATCH; @@ -204,13 +215,18 @@ public Boolean lt(BoundReference ref, Literal lit) { public Boolean ltEq(BoundReference ref, Literal lit) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. 
+ return ROWS_MIGHT_MATCH; + } + int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; @@ -224,7 +240,7 @@ public Boolean ltEq(BoundReference ref, Literal lit) { public Boolean gt(BoundReference ref, Literal lit) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } @@ -244,7 +260,7 @@ public Boolean gt(BoundReference ref, Literal lit) { public Boolean gtEq(BoundReference ref, Literal lit) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } @@ -264,13 +280,18 @@ public Boolean gtEq(BoundReference ref, Literal lit) { public Boolean eq(BoundReference ref, Literal lit) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. + return ROWS_MIGHT_MATCH; + } + int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_CANNOT_MATCH; @@ -300,7 +321,7 @@ public Boolean notEq(BoundReference ref, Literal lit) { public Boolean in(BoundReference ref, Set literalSet) { Integer id = ref.fieldId(); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_CANNOT_MATCH; } @@ -313,6 +334,12 @@ public Boolean in(BoundReference ref, Set literalSet) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. 
+ return ROWS_MIGHT_MATCH; + } + literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList()); if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match. return ROWS_CANNOT_MATCH; diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index d46a8216b1ec..6ea293a0b562 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -32,6 +32,7 @@ import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.StructType; +import org.apache.iceberg.util.NaNUtil; import static org.apache.iceberg.expressions.Expressions.rewriteNot; @@ -44,6 +45,11 @@ *

* Files are passed to {@link #eval(ContentFile)}, which returns true if all rows in the file must * contain matching rows and false if the file may contain rows that do not match. + *

+ * Due to the comparison implementation of ORC stats, for float/double columns in ORC files, if the first + * value in a file is NaN, the metrics of this file will report NaN for both the upper and lower bounds even + * though the column could contain non-NaN data. Thus in some scenarios explicit checks for NaN are necessary + * in order to not include files that may contain rows that don't match. */ public class StrictMetricsEvaluator { private final Schema schema; @@ -179,7 +185,7 @@ public Boolean lt(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } @@ -202,7 +208,7 @@ public Boolean ltEq(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } @@ -225,13 +231,18 @@ public Boolean gt(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id)); + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more. 
+ return ROWS_MIGHT_NOT_MATCH; + } + int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_MUST_MATCH; @@ -248,13 +259,18 @@ public Boolean gtEq(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id)); + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more. + return ROWS_MIGHT_NOT_MATCH; + } + int cmp = lit.comparator().compare(lower, lit.value()); if (cmp >= 0) { return ROWS_MUST_MATCH; @@ -271,7 +287,7 @@ public Boolean eq(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } @@ -304,13 +320,18 @@ public Boolean notEq(BoundReference ref, Literal lit) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_MUST_MATCH; } if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id)); + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more. 
+ return ROWS_MIGHT_NOT_MATCH; + } + int cmp = lit.comparator().compare(lower, lit.value()); if (cmp > 0) { return ROWS_MUST_MATCH; @@ -335,7 +356,7 @@ public Boolean in(BoundReference ref, Set literalSet) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (canContainNulls(id)) { + if (canContainNulls(id) || canContainNaNs(id)) { return ROWS_MIGHT_NOT_MATCH; } @@ -371,7 +392,7 @@ public Boolean notIn(BoundReference ref, Set literalSet) { Types.NestedField field = struct.field(id); Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id)); - if (containsNullsOnly(id)) { + if (containsNullsOnly(id) || containsNaNsOnly(id)) { return ROWS_MUST_MATCH; } @@ -380,6 +401,11 @@ public Boolean notIn(BoundReference ref, Set literalSet) { if (lowerBounds != null && lowerBounds.containsKey(id)) { T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id)); + if (NaNUtil.isNaN(lower)) { + // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more. + return ROWS_MIGHT_NOT_MATCH; + } + literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList()); if (literals.isEmpty()) { // if all values are less than lower bound, rows must match (notIn). return ROWS_MUST_MATCH; @@ -406,6 +432,11 @@ private boolean canContainNulls(Integer id) { return nullCounts == null || (nullCounts.containsKey(id) && nullCounts.get(id) > 0); } + private boolean canContainNaNs(Integer id) { + // nan counts might be null for early version writers when nan counters are not populated. 
+ return nanCounts != null && nanCounts.containsKey(id) && nanCounts.get(id) > 0; + } + private boolean containsNullsOnly(Integer id) { return valueCounts != null && valueCounts.containsKey(id) && nullCounts != null && nullCounts.containsKey(id) && diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java new file mode 100644 index 000000000000..415f36126a3b --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java @@ -0,0 +1,401 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.expressions; + +import java.nio.ByteBuffer; +import java.util.Set; +import java.util.function.BiFunction; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TestHelpers; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Test; + +import static org.apache.iceberg.types.Conversions.toByteBuffer; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + +/** + * This test class ensures that metrics evaluators could handle NaN as upper/lower bounds correctly. + */ +public class TestMetricsEvaluatorsNaNHandling { + private static final Schema SCHEMA = new Schema( + required(1, "all_nan", Types.DoubleType.get()), + required(2, "max_nan", Types.DoubleType.get()), + optional(3, "min_max_nan", Types.FloatType.get()), + required(4, "all_nan_null_bounds", Types.DoubleType.get()), + optional(5, "some_nan_correct_bounds", Types.FloatType.get()) + ); + + private static final DataFile FILE = new TestHelpers.TestDataFile("file.avro", TestHelpers.Row.of(), 50, + // any value counts, including nulls + ImmutableMap.builder() + .put(1, 10L) + .put(2, 10L) + .put(3, 10L) + .put(4, 10L) + .put(5, 10L) + .build(), + // null value counts + ImmutableMap.builder() + .put(1, 0L) + .put(2, 0L) + .put(3, 0L) + .put(4, 0L) + .put(5, 0L) + .build(), + // nan value counts + ImmutableMap.builder() + .put(1, 10L) + .put(4, 10L) + .put(5, 5L) + .build(), + // lower bounds + ImmutableMap.builder() + .put(1, toByteBuffer(Types.DoubleType.get(), Double.NaN)) + .put(2, toByteBuffer(Types.DoubleType.get(), 7D)) + .put(3, toByteBuffer(Types.FloatType.get(), Float.NaN)) + .put(5, toByteBuffer(Types.FloatType.get(), 7F)) + .build(), + // upper bounds + 
ImmutableMap.builder() + .put(1, toByteBuffer(Types.DoubleType.get(), Double.NaN)) + .put(2, toByteBuffer(Types.DoubleType.get(), Double.NaN)) + .put(3, toByteBuffer(Types.FloatType.get(), Float.NaN)) + .put(5, toByteBuffer(Types.FloatType.get(), 22F)) + .build()); + + private static final Set> LESS_THAN_EXPRESSIONS = + ImmutableSet.of(Expressions::lessThan, Expressions::lessThanOrEqual); + + private static final Set> GREATER_THAN_EXPRESSIONS = + ImmutableSet.of(Expressions::greaterThan, Expressions::greaterThanOrEqual); + + @Test + public void testInclusiveMetricsEvaluatorLessThanAndLessThanOrEqual() { + for (BiFunction func : LESS_THAN_EXPRESSIONS) { + boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE); + Assert.assertTrue("Should match: 10 is larger than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 1F)).eval(FILE); + Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE); + Assert.assertTrue("Should match: 10 larger than lower bound", shouldRead); + } + } + + @Test + public void 
testInclusiveMetricsEvaluatorGreaterThanAndGreaterThanOrEqual() { + for (BiFunction func : GREATER_THAN_EXPRESSIONS) { + boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 1D)).eval(FILE); + Assert.assertTrue("Should match: upper bound is larger than 1", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE); + Assert.assertTrue("Should match: upper bound is larger than 10", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 1F)).eval(FILE); + Assert.assertTrue("Should match: 1 is smaller than upper bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE); + Assert.assertTrue("Should match: 10 is smaller than upper bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertFalse("Should not match: 30 is greater than upper bound", shouldRead); + } + } + + @Test + public void testInclusiveMetricsEvaluatorEquals() { + boolean shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.equal("all_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("max_nan", 1D)).eval(FILE); + 
Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("max_nan", 10D)).eval(FILE); + Assert.assertTrue("Should match: 10 is within bounds", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("min_max_nan", 1F)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.equal("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 1F)).eval(FILE); + Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 10F)).eval(FILE); + Assert.assertTrue("Should match: 10 is within bounds", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertFalse("Should not match: 30 is greater than upper bound", shouldRead); + } + + @Test + public void testInclusiveMetricsEvaluatorNotEquals() { + boolean shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("all_nan", 1D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("max_nan", 1D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("max_nan", 10D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("min_max_nan", 1F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, 
Expressions.notEqual("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + } + + @Test + public void testInclusiveMetricsEvaluatorIn() { + boolean shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("all_nan", 1D, 10D, 30D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("max_nan", 1D, 10D, 30D)).eval(FILE); + Assert.assertTrue("Should match: 10 and 30 are greater than lower bound", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("min_max_nan", 1F, 10F, 30F)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("all_nan_null_bounds", 1D, 10D, 30D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("some_nan_correct_bounds", 1F, 10F, 30F)).eval(FILE); + Assert.assertTrue("Should match: 10 within bounds", shouldRead); + + shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.in("some_nan_correct_bounds", 1F, 30F)).eval(FILE); + Assert.assertFalse("Should not match: 1 not within bounds", shouldRead); + } + + @Test + public void testInclusiveMetricsEvaluatorNotIn() { + boolean shouldRead = new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("all_nan", 1D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("max_nan", 1D)).eval(FILE); + shouldRead = shouldRead & new 
InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("max_nan", 10D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("min_max_nan", 1F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = shouldRead & new InclusiveMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertTrue("Should match: no visibility", shouldRead); + } + + @Test + public void testStrictMetricsEvaluatorLessThanAndLessThanOrEqual() { + for (BiFunction func : LESS_THAN_EXPRESSIONS) { + boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE); + Assert.assertFalse("Should not match: 10 is less than upper bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); + Assert.assertFalse("Should not match: no visibility", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30F)).eval(FILE); + Assert.assertFalse("Should not match: nan value exists", shouldRead); + } + } + + @Test + public void testStrictMetricsEvaluatorGreaterThanAndGreaterThanOrEqual() { + for (BiFunction func : GREATER_THAN_EXPRESSIONS) { + boolean shouldRead = 
new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("max_nan", 1D)).eval(FILE); + Assert.assertTrue("Should match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE); + Assert.assertFalse("Should not match: 10 is larger than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE); + Assert.assertFalse("Should not match: no visibility", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertFalse("Should not match: nan value exists", shouldRead); + } + } + + @Test + public void testStrictMetricsEvaluatorNotEquals() { + boolean shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notEqual("all_nan", 1D)).eval(FILE); + Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("max_nan", 1D)).eval(FILE); + Assert.assertTrue("Should match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("max_nan", 10D)).eval(FILE); + Assert.assertFalse("Should not match: 10 is within bounds", shouldRead); + + shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("min_max_nan", 1F)).eval(FILE); + Assert.assertFalse("Should not match: no visibility", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notEqual("all_nan_null_bounds", 1D)).eval(FILE); + Assert.assertTrue("Should 
match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 1F)).eval(FILE); + Assert.assertTrue("Should match: 1 is smaller than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 10F)).eval(FILE); + Assert.assertFalse("Should not match: 10 is within bounds", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertTrue("Should match: 30 is greater than upper bound", shouldRead); + } + + @Test + public void testStrictMetricsEvaluatorEquals() { + boolean shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("all_nan", 1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("max_nan", 1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("max_nan", 10D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("min_max_nan", 1F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("all_nan_null_bounds", 1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertFalse("Should not match: bounds not equal to given value", shouldRead); + } + + @Test + public void testStrictMetricsEvaluatorNotIn() { + boolean shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("all_nan", 1D, 10D, 30D)).eval(FILE); + Assert.assertTrue("Should match: all nan column 
doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("max_nan", 1D, 10D, 30D)).eval(FILE); + Assert.assertFalse("Should not match: 10 and 30 are greater than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("max_nan", 1D)).eval(FILE); + Assert.assertTrue("Should match: 1 is less than lower bound", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("min_max_nan", 1F, 10F, 30F)).eval(FILE); + Assert.assertFalse("Should not match: no visibility", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("all_nan_null_bounds", 1D, 10D, 30D)).eval(FILE); + Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1F, 10F, 30F)).eval(FILE); + Assert.assertFalse("Should not match: 10 within bounds", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1D)).eval(FILE); + Assert.assertTrue("Should match: 1 not within bounds", shouldRead); + + shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.notIn("some_nan_correct_bounds", 30D)).eval(FILE); + Assert.assertTrue("Should match: 30 not within bounds", shouldRead); + } + + @Test + public void testStrictMetricsEvaluatorIn() { + boolean shouldRead = new StrictMetricsEvaluator( + SCHEMA, Expressions.in("all_nan", 1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("max_nan", 1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("max_nan", 10D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("min_max_nan", 1F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("all_nan_null_bounds", 
1D)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("some_nan_correct_bounds", 1F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("some_nan_correct_bounds", 10F)).eval(FILE); + shouldRead = shouldRead | new StrictMetricsEvaluator( + SCHEMA, Expressions.in("some_nan_correct_bounds", 30)).eval(FILE); + Assert.assertFalse("Should not match: bounds not equal to given value", shouldRead); + } +}