From b5d77f11beca2e74cf38b26bd500c36d1f7a9666 Mon Sep 17 00:00:00 2001
From: Yan Yan <yanya@amazon.com>
Date: Mon, 11 Jan 2021 16:25:17 -0800
Subject: [PATCH 1/2] API: handle NaN as min/max stats in evaluators

---
 .../InclusiveMetricsEvaluator.java            |  44 +-
 .../expressions/StrictMetricsEvaluator.java   |  52 ++-
 .../TestMetricsEvaluatorsNaNHandling.java     | 405 ++++++++++++++++++
 3 files changed, 483 insertions(+), 18 deletions(-)
 create mode 100644 api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java
diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
index e452d27287a8..d525495755bf 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
@@ -33,6 +33,7 @@
 import org.apache.iceberg.types.Conversions;
 import org.apache.iceberg.types.Types.StructType;
 import org.apache.iceberg.util.BinaryUtil;
+import org.apache.iceberg.util.NaNUtil;
 
 import static org.apache.iceberg.expressions.Expressions.rewriteNot;
 
@@ -184,7 +185,7 @@ public <T> Boolean notNaN(BoundReference<T> ref) {
     public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
       Integer id = ref.fieldId();
 
-      if (containsNullsOnly(id)) {
+      if (containsNullsOnly(id) || containsNaNsOnly(id)) {
         return ROWS_CANNOT_MATCH;
       }
 
@@ -192,7 +193,12 @@ public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
         T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
 
         int cmp = lit.comparator().compare(lower, lit.value());
-        if (cmp >= 0) {
+
+        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
+        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
+        // lower bound despite that the column could contain non-NaN data.
+        // Without this NaN check below, we may skip including a file that contains matching data.
+        if (cmp >= 0 && !NaNUtil.isNaN(lower)) {
           return ROWS_CANNOT_MATCH;
         }
       }
@@ -204,7 +210,7 @@ public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
     public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
       Integer id = ref.fieldId();
 
-      if (containsNullsOnly(id)) {
+      if (containsNullsOnly(id) || containsNaNsOnly(id)) {
         return ROWS_CANNOT_MATCH;
       }
 
@@ -212,7 +218,12 @@ public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
         T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
 
         int cmp = lit.comparator().compare(lower, lit.value());
-        if (cmp > 0) {
+
+        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
+        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
+        // lower bound despite that the column could contain non-NaN data.
+        // Without this NaN check below, we may skip including a file that contains matching data.
+        if (cmp > 0 && !NaNUtil.isNaN(lower)) {
           return ROWS_CANNOT_MATCH;
         }
       }
@@ -224,7 +235,7 @@ public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
     public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
       Integer id = ref.fieldId();
 
-      if (containsNullsOnly(id)) {
+      if (containsNullsOnly(id) || containsNaNsOnly(id)) {
         return ROWS_CANNOT_MATCH;
       }
 
@@ -244,7 +255,7 @@ public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
     public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
       Integer id = ref.fieldId();
 
-      if (containsNullsOnly(id)) {
+      if (containsNullsOnly(id) || containsNaNsOnly(id)) {
         return ROWS_CANNOT_MATCH;
       }
 
@@ -264,13 +275,21 @@ public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
     public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
       Integer id = ref.fieldId();
 
-      if (containsNullsOnly(id)) {
+      if (containsNullsOnly(id) || containsNaNsOnly(id)) {
         return ROWS_CANNOT_MATCH;
       }
 
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
 
+        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
+        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
+        // lower bound despite that the column could contain non-NaN data.
+        // Without this NaN check below, we may skip including a file that contains matching data.
+        if (NaNUtil.isNaN(lower)) {
+          return ROWS_MIGHT_MATCH;
+        }
+
         int cmp = lit.comparator().compare(lower, lit.value());
         if (cmp > 0) {
           return ROWS_CANNOT_MATCH;
@@ -300,7 +319,7 @@ public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
     public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
       Integer id = ref.fieldId();
 
-      if (containsNullsOnly(id)) {
+      if (containsNullsOnly(id) || containsNaNsOnly(id)) {
         return ROWS_CANNOT_MATCH;
       }
 
@@ -313,6 +332,15 @@ public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
 
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
+
+        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
+        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
+        // lower bound despite that the column could contain non-NaN data.
+        // Without this NaN check below, we may skip including a file that contains matching data.
+        if (NaNUtil.isNaN(lower)) {
+          return ROWS_MIGHT_MATCH;
+        }
+
         literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList());
         if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match.
           return ROWS_CANNOT_MATCH;
diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
index d46a8216b1ec..461237269fbb 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
@@ -32,6 +32,7 @@
 import org.apache.iceberg.types.Conversions;
 import org.apache.iceberg.types.Types;
 import org.apache.iceberg.types.Types.StructType;
+import org.apache.iceberg.util.NaNUtil;
 
 import static org.apache.iceberg.expressions.Expressions.rewriteNot;
 
@@ -179,7 +180,7 @@ public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
-      if (canContainNulls(id)) {
+      if (canContainNulls(id) || canContainNaNs(id)) {
         return ROWS_MIGHT_NOT_MATCH;
       }
 
@@ -202,7 +203,7 @@ public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
-      if (canContainNulls(id)) {
+      if (canContainNulls(id) || canContainNaNs(id)) {
         return ROWS_MIGHT_NOT_MATCH;
       }
 
@@ -225,7 +226,7 @@ public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
-      if (canContainNulls(id)) {
+      if (canContainNulls(id) || canContainNaNs(id)) {
         return ROWS_MIGHT_NOT_MATCH;
       }
 
@@ -233,7 +234,12 @@ public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
         T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));
 
         int cmp = lit.comparator().compare(lower, lit.value());
-        if (cmp > 0) {
+
+        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
+        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
+        // lower bound despite that the column could contain non-NaN data.
+        // Without this NaN check below, we may include a file that contains rows that don't match.
+        if (cmp > 0 && !NaNUtil.isNaN(lower)) {
           return ROWS_MUST_MATCH;
         }
       }
@@ -248,7 +254,7 @@ public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
-      if (canContainNulls(id)) {
+      if (canContainNulls(id) || canContainNaNs(id)) {
         return ROWS_MIGHT_NOT_MATCH;
       }
 
@@ -256,7 +262,12 @@ public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
         T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));
 
         int cmp = lit.comparator().compare(lower, lit.value());
-        if (cmp >= 0) {
+
+        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
+        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
+        // lower bound despite that the column could contain non-NaN data.
+        // Without this NaN check below, we may include a file that contains rows that don't match.
+        if (cmp >= 0 && !NaNUtil.isNaN(lower)) {
           return ROWS_MUST_MATCH;
         }
       }
@@ -271,7 +282,7 @@ public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
-      if (canContainNulls(id)) {
+      if (canContainNulls(id) || canContainNaNs(id)) {
         return ROWS_MIGHT_NOT_MATCH;
       }
 
@@ -304,13 +315,21 @@ public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
-      if (containsNullsOnly(id)) {
+      if (containsNullsOnly(id) || containsNaNsOnly(id)) {
         return ROWS_MUST_MATCH;
       }
 
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
 
+        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
+        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
+        // lower bound despite that the column could contain non-NaN data.
+        // Thus we don't have visibility into the stats when lower bound is NaN.
+        if (NaNUtil.isNaN(lower)) {
+          return ROWS_MIGHT_NOT_MATCH;
+        }
+
         int cmp = lit.comparator().compare(lower, lit.value());
         if (cmp > 0) {
           return ROWS_MUST_MATCH;
@@ -335,7 +354,7 @@ public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
-      if (canContainNulls(id)) {
+      if (canContainNulls(id) || canContainNaNs(id)) {
         return ROWS_MIGHT_NOT_MATCH;
       }
 
@@ -371,7 +390,7 @@ public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) {
       Types.NestedField field = struct.field(id);
       Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));
 
-      if (containsNullsOnly(id)) {
+      if (containsNullsOnly(id) || containsNaNsOnly(id)) {
         return ROWS_MUST_MATCH;
       }
 
@@ -380,6 +399,14 @@ public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) {
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
 
+        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
+        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
+        // lower bound despite that the column could contain non-NaN data.
+        // Thus we don't have visibility into the stats when lower bound is NaN.
+        if (NaNUtil.isNaN(lower)) {
+          return ROWS_MIGHT_NOT_MATCH;
+        }
+
         literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList());
         if (literals.isEmpty()) {  // if all values are less than lower bound, rows must match (notIn).
           return ROWS_MUST_MATCH;
@@ -406,6 +433,11 @@ private boolean canContainNulls(Integer id) {
       return nullCounts == null || (nullCounts.containsKey(id) && nullCounts.get(id) > 0);
     }
 
+    private boolean canContainNaNs(Integer id) {
+      // nan counts might be null for early version writers when nan counters are not populated.
+      return nanCounts != null && nanCounts.containsKey(id) && nanCounts.get(id) > 0;
+    }
+
     private boolean containsNullsOnly(Integer id) {
       return valueCounts != null && valueCounts.containsKey(id) &&
           nullCounts != null && nullCounts.containsKey(id) &&
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java
new file mode 100644
index 000000000000..bc8747b30ece
--- /dev/null
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java
@@ -0,0 +1,405 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.expressions;
+
+import java.nio.ByteBuffer;
+import java.util.Set;
+import java.util.function.BiFunction;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.TestHelpers;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
+import org.apache.iceberg.types.Types;
+import org.junit.Assert;
+import org.junit.Test;
+
+import static org.apache.iceberg.types.Conversions.toByteBuffer;
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.iceberg.types.Types.NestedField.required;
+
+/**
+ * This test class ensures that metrics evaluators could handle NaN as upper/lower bounds correctly.
+ */
+public class TestMetricsEvaluatorsNaNHandling {
+  private static final Schema SCHEMA = new Schema(
+      required(1, "all_nan", Types.DoubleType.get()),
+      required(2, "max_nan", Types.DoubleType.get()),
+      optional(3, "min_max_nan", Types.FloatType.get()),
+      required(4, "all_nan_null_bounds", Types.DoubleType.get()),
+      optional(5, "some_nan_correct_bounds", Types.FloatType.get())
+  );
+
+  private static final DataFile FILE = new TestHelpers.TestDataFile("file.avro", TestHelpers.Row.of(), 50,
+      // any value counts, including nulls
+      ImmutableMap.<Integer, Long>builder()
+          .put(1, 10L)
+          .put(2, 10L)
+          .put(3, 10L)
+          .put(4, 10L)
+          .put(5, 10L)
+          .build(),
+      // null value counts
+      ImmutableMap.<Integer, Long>builder()
+          .put(1, 0L)
+          .put(2, 0L)
+          .put(3, 0L)
+          .put(4, 0L)
+          .put(5, 0L)
+          .build(),
+      // nan value counts
+      ImmutableMap.<Integer, Long>builder()
+          .put(1, 10L)
+          .put(4, 10L)
+          .put(5, 5L)
+          .build(),
+      // lower bounds
+      ImmutableMap.<Integer, ByteBuffer>builder()
+          .put(1, toByteBuffer(Types.DoubleType.get(), Double.NaN))
+          .put(2, toByteBuffer(Types.DoubleType.get(), 7D))
+          .put(3, toByteBuffer(Types.FloatType.get(), Float.NaN))
+          .put(5, toByteBuffer(Types.FloatType.get(), 7F))
+          .build(),
+      // upper bounds
+      ImmutableMap.<Integer, ByteBuffer>builder()
+          .put(1, toByteBuffer(Types.DoubleType.get(), Double.NaN))
+          .put(2, toByteBuffer(Types.DoubleType.get(), Double.NaN))
+          .put(3, toByteBuffer(Types.FloatType.get(), Float.NaN))
+          .put(5, toByteBuffer(Types.FloatType.get(), 22F))
+          .build());
+
+  private static final Set<BiFunction<String, Number, Expression>> LESS_THAN_EXPRESSIONS =
+      ImmutableSet.of(Expressions::lessThan, Expressions::lessThanOrEqual);
+
+  private static final Set<BiFunction<String, Number, Expression>> GREATER_THAN_EXPRESSIONS =
+      ImmutableSet.of(Expressions::greaterThan, Expressions::greaterThanOrEqual);
+
+  @Test
+  public void testInclusiveMetricsEvaluatorLessThanAndLessThanOrEqual() {
+    for (BiFunction<String, Number, Expression> func : LESS_THAN_EXPRESSIONS) {
+      boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE);
+      Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 1D)).eval(FILE);
+      Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE);
+      Assert.assertTrue("Should match: 10 is larger than lower bound", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE);
+      Assert.assertTrue("Should match: no visibility", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE);
+      Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 1F)).eval(FILE);
+      Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE);
+      Assert.assertTrue("Should match: 10 larger than lower bound", shouldRead);
+    }
+  }
+
+  @Test
+  public void testInclusiveMetricsEvaluatorGreaterThanAndGreaterThanOrEqual() {
+    for (BiFunction<String, Number, Expression> func : GREATER_THAN_EXPRESSIONS) {
+      boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE);
+      Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 1D)).eval(FILE);
+      Assert.assertTrue("Should match: upper bound is larger than 1", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE);
+      Assert.assertTrue("Should match: upper bound is larger than 10", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE);
+      Assert.assertTrue("Should match: no visibility", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE);
+      Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 1F)).eval(FILE);
+      Assert.assertTrue("Should match: 1 is smaller than upper bound", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE);
+      Assert.assertTrue("Should match: 10 is smaller than upper bound", shouldRead);
+
+      shouldRead = new InclusiveMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30)).eval(FILE);
+      Assert.assertFalse("Should not match: 30 is greater than upper bound", shouldRead);
+    }
+  }
+
+  @Test
+  public void testInclusiveMetricsEvaluatorEquals() {
+    boolean shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.equal("all_nan", 1D)).eval(FILE);
+    Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("max_nan", 1D)).eval(FILE);
+    Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("max_nan", 10D)).eval(FILE);
+    Assert.assertTrue("Should match: 10 is within bounds", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(SCHEMA, Expressions.equal("min_max_nan", 1F)).eval(FILE);
+    Assert.assertTrue("Should match: no visibility", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.equal("all_nan_null_bounds", 1D)).eval(FILE);
+    Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.equal("some_nan_correct_bounds", 1F)).eval(FILE);
+    Assert.assertFalse("Should not match: 1 is smaller than lower bound", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.equal("some_nan_correct_bounds", 10F)).eval(FILE);
+    Assert.assertTrue("Should match: 10 is within bounds", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)).eval(FILE);
+    Assert.assertFalse("Should not match: 30 is greater than upper bound", shouldRead);
+  }
+
+  @Test
+  public void testInclusiveMetricsEvaluatorNotEquals() {
+    boolean shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("all_nan", 1D)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("max_nan", 1D)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("max_nan", 10D)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("min_max_nan", 1F)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("all_nan_null_bounds", 1D)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 1F)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 10F)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 30)).eval(FILE);
+    Assert.assertTrue("Should match: no visibility", shouldRead);
+  }
+
+  @Test
+  public void testInclusiveMetricsEvaluatorIn() {
+    boolean shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.in("all_nan", 1D, 10D, 30D)).eval(FILE);
+    Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.in("max_nan", 1D, 10D, 30D)).eval(FILE);
+    Assert.assertTrue("Should match: 10 and 30 are greater than lower bound", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.in("min_max_nan", 1F, 10F, 30F)).eval(FILE);
+    Assert.assertTrue("Should match: no visibility", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.in("all_nan_null_bounds", 1D, 10D, 30D)).eval(FILE);
+    Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.in("some_nan_correct_bounds", 1F, 10F, 30F)).eval(FILE);
+    Assert.assertTrue("Should match: 10 within bounds", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.in("some_nan_correct_bounds", 1F)).eval(FILE);
+    Assert.assertFalse("Should not match: 1 not within bounds", shouldRead);
+
+    shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.in("some_nan_correct_bounds", 30F)).eval(FILE);
+    Assert.assertFalse("Should not match: 30 not within bounds", shouldRead);
+  }
+
+  @Test
+  public void testInclusiveMetricsEvaluatorNotIn() {
+    boolean shouldRead = new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notIn("all_nan", 1D)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notIn("max_nan", 1D)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notIn("max_nan", 10D)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notIn("min_max_nan", 1F)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notIn("all_nan_null_bounds", 1D)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1F)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notIn("some_nan_correct_bounds", 10F)).eval(FILE);
+    shouldRead = shouldRead & new InclusiveMetricsEvaluator(
+        SCHEMA, Expressions.notIn("some_nan_correct_bounds", 30)).eval(FILE);
+    Assert.assertTrue("Should match: no visibility", shouldRead);
+  }
+
+  @Test
+  public void testStrictMetricsEvaluatorLessThanAndLessThanOrEqual() {
+    for (BiFunction<String, Number, Expression> func : LESS_THAN_EXPRESSIONS) {
+      boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE);
+      Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+      shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE);
+      Assert.assertFalse("Should not match: 10 is less than upper bound", shouldRead);
+
+      shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE);
+      Assert.assertFalse("Should not match: no visibility", shouldRead);
+
+      shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE);
+      Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+      shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE);
+      Assert.assertFalse("Should not match: nan value exists", shouldRead);
+    }
+  }
+
+  @Test
+  public void testStrictMetricsEvaluatorGreaterThanAndGreaterThanOrEqual() {
+    for (BiFunction<String, Number, Expression> func : GREATER_THAN_EXPRESSIONS) {
+      boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan", 1D)).eval(FILE);
+      Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+      shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("max_nan", 1D)).eval(FILE);
+      Assert.assertTrue("Should match: 1 is smaller than lower bound", shouldRead);
+
+      shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("max_nan", 10D)).eval(FILE);
+      Assert.assertFalse("Should not match: 10 is larger than lower bound", shouldRead);
+
+      shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("min_max_nan", 1F)).eval(FILE);
+      Assert.assertFalse("Should not match: no visibility", shouldRead);
+
+      shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE);
+      Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
+
+      shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30)).eval(FILE);
+      Assert.assertFalse("Should not match: nan value exists", shouldRead);
+    }
+  }
+
+  @Test
+  public void testStrictMetricsEvaluatorNotEquals() {
+    boolean shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("all_nan", 1D)).eval(FILE);
+    Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("max_nan", 1D)).eval(FILE);
+    Assert.assertTrue("Should match: 1 is smaller than lower bound", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("max_nan", 10D)).eval(FILE);
+    Assert.assertFalse("Should not match: 10 is within bounds", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(SCHEMA, Expressions.notEqual("min_max_nan", 1F)).eval(FILE);
+    Assert.assertFalse("Should not match: no visibility", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("all_nan_null_bounds", 1D)).eval(FILE);
+    Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 1F)).eval(FILE);
+    Assert.assertTrue("Should match: 1 is smaller than lower bound", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 10F)).eval(FILE);
+    Assert.assertFalse("Should not match: 10 is within bounds", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notEqual("some_nan_correct_bounds", 30)).eval(FILE);
+    Assert.assertTrue("Should match: 30 is greater than upper bound", shouldRead);
+  }
+
+  @Test
+  public void testStrictMetricsEvaluatorEquals() {
+    boolean shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.equal("all_nan", 1D)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.equal("max_nan", 1D)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.equal("max_nan", 10D)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.equal("min_max_nan", 1F)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.equal("all_nan_null_bounds", 1D)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.equal("some_nan_correct_bounds", 1F)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.equal("some_nan_correct_bounds", 10F)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)).eval(FILE);
+    Assert.assertFalse("Should not match: bounds not equal to given value", shouldRead);
+  }
+
+  @Test
+  public void testStrictMetricsEvaluatorNotIn() {
+    boolean shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notIn("all_nan", 1D, 10D, 30D)).eval(FILE);
+    Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notIn("max_nan", 1D, 10D, 30D)).eval(FILE);
+    Assert.assertFalse("Should not match: 10 and 30 are greater than lower bound", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notIn("max_nan", 1D)).eval(FILE);
+    Assert.assertTrue("Should match: 1 is less than lower bound", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notIn("min_max_nan", 1F, 10F, 30F)).eval(FILE);
+    Assert.assertFalse("Should not match: no visibility", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notIn("all_nan_null_bounds", 1D, 10D, 30D)).eval(FILE);
+    Assert.assertTrue("Should match: all nan column doesn't contain number", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1F, 10F, 30F)).eval(FILE);
+    Assert.assertFalse("Should not match: 10 within bounds", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notIn("some_nan_correct_bounds", 1D)).eval(FILE);
+    Assert.assertTrue("Should match: 1 not within bounds", shouldRead);
+
+    shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.notIn("some_nan_correct_bounds", 30D)).eval(FILE);
+    Assert.assertTrue("Should match: 30 not within bounds", shouldRead);
+  }
+
+  @Test
+  public void testStrictMetricsEvaluatorIn() {
+    boolean shouldRead = new StrictMetricsEvaluator(
+        SCHEMA, Expressions.in("all_nan", 1D)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.in("max_nan", 1D)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.in("max_nan", 10D)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.in("min_max_nan", 1F)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.in("all_nan_null_bounds", 1D)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.in("some_nan_correct_bounds", 1F)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.in("some_nan_correct_bounds", 10F)).eval(FILE);
+    shouldRead = shouldRead | new StrictMetricsEvaluator(
+        SCHEMA, Expressions.equal("some_nan_correct_bounds", 30)).eval(FILE);
+    Assert.assertFalse("Should not match: bounds not equal to given value", shouldRead);
+  }
+}

From f3e6f4e1945915c4d95043da6e7442bdf3b8c4f4 Mon Sep 17 00:00:00 2001
From: Yan Yan <yanya@amazon.com>
Date: Tue, 19 Jan 2021 16:32:57 -0800
Subject: [PATCH 2/2] update comments and tests

---
 .../InclusiveMetricsEvaluator.java            | 39 +++++++++----------
 .../expressions/StrictMetricsEvaluator.java   | 39 +++++++++----------
 .../TestMetricsEvaluatorsNaNHandling.java     |  8 +---
 3 files changed, 40 insertions(+), 46 deletions(-)

diff --git a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
index d525495755bf..a6c3c65e9830 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java
@@ -45,6 +45,11 @@
  * Files are passed to {@link #eval(ContentFile)}, which returns true if the file may contain matching
  * rows and false if the file cannot contain matching rows. Files may be skipped if and only if the
  * return value of {@code eval} is false.
+ * <p>
+ * Due to the comparison implementation of ORC stats, for float/double columns in ORC files, if the first
+ * value in a file is NaN, metrics of this file will report NaN for both upper and lower bound despite
+ * that the column could contain non-NaN data. Thus in some scenarios explicitly checks for NaN is necessary
+ * in order to not skip files that may contain matching data.
  */
 public class InclusiveMetricsEvaluator {
   private static final int IN_PREDICATE_LIMIT = 200;
@@ -192,13 +197,13 @@ public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
 
-        int cmp = lit.comparator().compare(lower, lit.value());
+        if (NaNUtil.isNaN(lower)) {
+          // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
+          return ROWS_MIGHT_MATCH;
+        }
 
-        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
-        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
-        // lower bound despite that the column could contain non-NaN data.
-        // Without this NaN check below, we may skip including a file that contains matching data.
-        if (cmp >= 0 && !NaNUtil.isNaN(lower)) {
+        int cmp = lit.comparator().compare(lower, lit.value());
+        if (cmp >= 0) {
           return ROWS_CANNOT_MATCH;
         }
       }
@@ -217,13 +222,13 @@ public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
 
-        int cmp = lit.comparator().compare(lower, lit.value());
+        if (NaNUtil.isNaN(lower)) {
+          // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
+          return ROWS_MIGHT_MATCH;
+        }
 
-        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
-        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
-        // lower bound despite that the column could contain non-NaN data.
-        // Without this NaN check below, we may skip including a file that contains matching data.
-        if (cmp > 0 && !NaNUtil.isNaN(lower)) {
+        int cmp = lit.comparator().compare(lower, lit.value());
+        if (cmp > 0) {
           return ROWS_CANNOT_MATCH;
         }
       }
@@ -282,11 +287,8 @@ public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
 
-        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
-        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
-        // lower bound despite that the column could contain non-NaN data.
-        // Without this NaN check below, we may skip including a file that contains matching data.
         if (NaNUtil.isNaN(lower)) {
+          // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
           return ROWS_MIGHT_MATCH;
         }
 
@@ -333,11 +335,8 @@ public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
 
-        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
-        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
-        // lower bound despite that the column could contain non-NaN data.
-        // Without this NaN check below, we may skip including a file that contains matching data.
         if (NaNUtil.isNaN(lower)) {
+          // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
           return ROWS_MIGHT_MATCH;
         }
 
diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
index 461237269fbb..6ea293a0b562 100644
--- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
+++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java
@@ -45,6 +45,11 @@
  * <p>
  * Files are passed to {@link #eval(ContentFile)}, which returns true if all rows in the file must
  * contain matching rows and false if the file may contain rows that do not match.
+ * <p>
+ * Due to the comparison implementation of ORC stats, for float/double columns in ORC files, if the first
+ * value in a file is NaN, metrics of this file will report NaN for both upper and lower bound despite
+ * that the column could contain non-NaN data. Thus in some scenarios explicitly checks for NaN is necessary
+ * in order to not include files that may contain rows that don't match.
  */
 public class StrictMetricsEvaluator {
   private final Schema schema;
@@ -233,13 +238,13 @@ public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));
 
-        int cmp = lit.comparator().compare(lower, lit.value());
+        if (NaNUtil.isNaN(lower)) {
+          // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more.
+          return ROWS_MIGHT_NOT_MATCH;
+        }
 
-        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
-        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
-        // lower bound despite that the column could contain non-NaN data.
-        // Without this NaN check below, we may include a file that contains rows that don't match.
-        if (cmp > 0 && !NaNUtil.isNaN(lower)) {
+        int cmp = lit.comparator().compare(lower, lit.value());
+        if (cmp > 0) {
           return ROWS_MUST_MATCH;
         }
       }
@@ -261,13 +266,13 @@ public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));
 
-        int cmp = lit.comparator().compare(lower, lit.value());
+        if (NaNUtil.isNaN(lower)) {
+          // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more.
+          return ROWS_MIGHT_NOT_MATCH;
+        }
 
-        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
-        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
-        // lower bound despite that the column could contain non-NaN data.
-        // Without this NaN check below, we may include a file that contains rows that don't match.
-        if (cmp >= 0 && !NaNUtil.isNaN(lower)) {
+        int cmp = lit.comparator().compare(lower, lit.value());
+        if (cmp >= 0) {
           return ROWS_MUST_MATCH;
         }
       }
@@ -322,11 +327,8 @@ public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
 
-        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
-        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
-        // lower bound despite that the column could contain non-NaN data.
-        // Thus we don't have visibility into the stats when lower bound is NaN.
         if (NaNUtil.isNaN(lower)) {
+          // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more.
           return ROWS_MIGHT_NOT_MATCH;
         }
 
@@ -399,11 +401,8 @@ public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) {
       if (lowerBounds != null && lowerBounds.containsKey(id)) {
         T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));
 
-        // Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
-        // if the first value in a file is NaN, metrics of this file will report NaN for both upper and
-        // lower bound despite that the column could contain non-NaN data.
-        // Thus we don't have visibility into the stats when lower bound is NaN.
         if (NaNUtil.isNaN(lower)) {
+          // NaN indicates unreliable bounds. See the StrictMetricsEvaluator docs for more.
           return ROWS_MIGHT_NOT_MATCH;
         }
 
diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java
index bc8747b30ece..415f36126a3b 100644
--- a/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java
+++ b/api/src/test/java/org/apache/iceberg/expressions/TestMetricsEvaluatorsNaNHandling.java
@@ -222,12 +222,8 @@ public void testInclusiveMetricsEvaluatorIn() {
     Assert.assertTrue("Should match: 10 within bounds", shouldRead);
 
     shouldRead = new InclusiveMetricsEvaluator(
-        SCHEMA, Expressions.in("some_nan_correct_bounds", 1F)).eval(FILE);
+        SCHEMA, Expressions.in("some_nan_correct_bounds", 1F, 30F)).eval(FILE);
     Assert.assertFalse("Should not match: 1 not within bounds", shouldRead);
-
-    shouldRead = new InclusiveMetricsEvaluator(
-        SCHEMA, Expressions.in("some_nan_correct_bounds", 30F)).eval(FILE);
-    Assert.assertFalse("Should not match: 30 not within bounds", shouldRead);
   }
 
   @Test
@@ -266,7 +262,7 @@ public void testStrictMetricsEvaluatorLessThanAndLessThanOrEqual() {
       shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("all_nan_null_bounds", 1D)).eval(FILE);
       Assert.assertFalse("Should not match: all nan column doesn't contain number", shouldRead);
 
-      shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 10F)).eval(FILE);
+      shouldRead = new StrictMetricsEvaluator(SCHEMA, func.apply("some_nan_correct_bounds", 30F)).eval(FILE);
       Assert.assertFalse("Should not match: nan value exists", shouldRead);
     }
   }