API: handle NaN as min/max stats in evaluators #2069

Merged: 2 commits, Jan 20, 2021
Changes from 1 commit
(First changed file: the inclusive metrics evaluator)
@@ -33,6 +33,7 @@
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Types.StructType;
import org.apache.iceberg.util.BinaryUtil;
import org.apache.iceberg.util.NaNUtil;

import static org.apache.iceberg.expressions.Expressions.rewriteNot;

@@ -184,15 +185,20 @@ public <T> Boolean notNaN(BoundReference<T> ref) {
public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();

if (containsNullsOnly(id)) {
if (containsNullsOnly(id) || containsNaNsOnly(id)) {
return ROWS_CANNOT_MATCH;
}

if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));

int cmp = lit.comparator().compare(lower, lit.value());
if (cmp >= 0) {

// Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
// if the first value in a file is NaN, metrics of this file will report NaN for both upper and
// lower bound despite that the column could contain non-NaN data.
// Without this NaN check below, we may skip including a file that contains matching data.
if (cmp >= 0 && !NaNUtil.isNaN(lower)) {
return ROWS_CANNOT_MATCH;
}
}
@@ -204,15 +210,20 @@ public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();

if (containsNullsOnly(id)) {
if (containsNullsOnly(id) || containsNaNsOnly(id)) {
return ROWS_CANNOT_MATCH;
}

if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));

int cmp = lit.comparator().compare(lower, lit.value());
if (cmp > 0) {

// Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
Contributor:

Thanks for noticing this. Instead of having this paragraph everywhere, can we abstract this to a method like checkNaNLowerBound, and make this the documentation of that method?

Contributor Author:

Thanks for the quick review! Yeah, I do realize that repeating the same comment over and over is a bit annoying, but I wasn't sure where the right balance is. Since I'm hoping to check isNaN after comparing, to avoid an unnecessary check in lt/ltEq, the only thing I can abstract out is NaNUtil.isNaN(lower), so we'd essentially be wrapping a wrapper; and I guess that might not help much with readability either, since the actual explanation would then live outside the logic flow here and the reader would have to jump around to understand the full intention. Maybe we can shorten this comment everywhere and keep the full version at the start of the class? Do you or other people have any suggestions?

Contributor @jackye1995 (Jan 14, 2021):

Sorry, I did not see the comment. Given the amount of commenting needed, I think it is valuable enough to abstract out the method and have something like:

/*
 * all the comments...
 */
private <T> boolean isLowerBoundNaN(T lower) {
  return NaNUtil.isNaN(lower);
}

I am not good at naming; you can probably come up with a better name...

Contributor @rdblue (Jan 16, 2021):

I don't think that there is a need for an extra method that has just one method call. I'd probably do it like this:

        T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));
        if (NaNUtil.isNaN(lower)) {
          // NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more.
          return ROWS_MIGHT_MATCH;
        }

        int cmp = lit.comparator().compare(lower, lit.value());
        if (cmp > 0) {
          return ROWS_CANNOT_MATCH;
        }

The docs would go in the javadoc for the whole class, and each NaN check could simply refer back to it. I also moved the NaN check above the comparison to keep the logic simple: if the value is NaN, the bound is invalid.

// if the first value in a file is NaN, metrics of this file will report NaN for both upper and
// lower bound despite that the column could contain non-NaN data.
// Without this NaN check below, we may skip including a file that contains matching data.
if (cmp > 0 && !NaNUtil.isNaN(lower)) {
return ROWS_CANNOT_MATCH;
}
}
@@ -224,7 +235,7 @@ public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();

if (containsNullsOnly(id)) {
if (containsNullsOnly(id) || containsNaNsOnly(id)) {
return ROWS_CANNOT_MATCH;
}

@@ -244,7 +255,7 @@ public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();

if (containsNullsOnly(id)) {
if (containsNullsOnly(id) || containsNaNsOnly(id)) {
return ROWS_CANNOT_MATCH;
}

@@ -264,13 +275,21 @@ public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
Integer id = ref.fieldId();

if (containsNullsOnly(id)) {
if (containsNullsOnly(id) || containsNaNsOnly(id)) {
return ROWS_CANNOT_MATCH;
}

if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));

// Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
// if the first value in a file is NaN, metrics of this file will report NaN for both upper and
// lower bound despite that the column could contain non-NaN data.
// Without this NaN check below, we may skip including a file that contains matching data.
if (NaNUtil.isNaN(lower)) {
return ROWS_MIGHT_MATCH;
}

int cmp = lit.comparator().compare(lower, lit.value());
if (cmp > 0) {
return ROWS_CANNOT_MATCH;
@@ -300,7 +319,7 @@ public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
Integer id = ref.fieldId();

if (containsNullsOnly(id)) {
if (containsNullsOnly(id) || containsNaNsOnly(id)) {
return ROWS_CANNOT_MATCH;
}

@@ -313,6 +332,15 @@ public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {

if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id));

// Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
// if the first value in a file is NaN, metrics of this file will report NaN for both upper and
// lower bound despite that the column could contain non-NaN data.
// Without this NaN check below, we may skip including a file that contains matching data.
if (NaNUtil.isNaN(lower)) {
return ROWS_MIGHT_MATCH;
}

literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList());
if (literals.isEmpty()) { // if all values are less than lower bound, rows cannot match.
return ROWS_CANNOT_MATCH;
(Second changed file: the strict metrics evaluator)
@@ -32,6 +32,7 @@
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.types.Types.StructType;
import org.apache.iceberg.util.NaNUtil;

import static org.apache.iceberg.expressions.Expressions.rewriteNot;

@@ -179,7 +180,7 @@ public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));

if (canContainNulls(id)) {
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}

@@ -202,7 +203,7 @@ public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));

if (canContainNulls(id)) {
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}

@@ -225,15 +226,20 @@ public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));

if (canContainNulls(id)) {
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}

if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));

int cmp = lit.comparator().compare(lower, lit.value());
if (cmp > 0) {

// Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
// if the first value in a file is NaN, metrics of this file will report NaN for both upper and
// lower bound despite that the column could contain non-NaN data.
// Without this NaN check below, we may include a file that contains rows that don't match.
if (cmp > 0 && !NaNUtil.isNaN(lower)) {
return ROWS_MUST_MATCH;
}
}
@@ -248,15 +254,20 @@ public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));

if (canContainNulls(id)) {
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}

if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(field.type(), lowerBounds.get(id));

int cmp = lit.comparator().compare(lower, lit.value());
if (cmp >= 0) {

// Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
// if the first value in a file is NaN, metrics of this file will report NaN for both upper and
// lower bound despite that the column could contain non-NaN data.
// Without this NaN check below, we may include a file that contains rows that don't match.
if (cmp >= 0 && !NaNUtil.isNaN(lower)) {
return ROWS_MUST_MATCH;
}
}
@@ -271,7 +282,7 @@ public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));

if (canContainNulls(id)) {
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}

@@ -304,13 +315,21 @@ public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));

if (containsNullsOnly(id)) {
if (containsNullsOnly(id) || containsNaNsOnly(id)) {
return ROWS_MUST_MATCH;
}

if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));

// Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
// if the first value in a file is NaN, metrics of this file will report NaN for both upper and
// lower bound despite that the column could contain non-NaN data.
// Thus we don't have visibility into the stats when lower bound is NaN.
if (NaNUtil.isNaN(lower)) {
return ROWS_MIGHT_NOT_MATCH;
}

int cmp = lit.comparator().compare(lower, lit.value());
if (cmp > 0) {
return ROWS_MUST_MATCH;
@@ -335,7 +354,7 @@ public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));

if (canContainNulls(id)) {
if (canContainNulls(id) || canContainNaNs(id)) {
return ROWS_MIGHT_NOT_MATCH;
}

@@ -371,7 +390,7 @@ public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) {
Types.NestedField field = struct.field(id);
Preconditions.checkNotNull(field, "Cannot filter by nested column: %s", schema.findField(id));

if (containsNullsOnly(id)) {
if (containsNullsOnly(id) || containsNaNsOnly(id)) {
return ROWS_MUST_MATCH;
}

@@ -380,6 +399,14 @@ public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) {
if (lowerBounds != null && lowerBounds.containsKey(id)) {
T lower = Conversions.fromByteBuffer(struct.field(id).type(), lowerBounds.get(id));

// Due to the comparison implementation of ORC stats, for float/double columns in ORC files,
// if the first value in a file is NaN, metrics of this file will report NaN for both upper and
// lower bound despite that the column could contain non-NaN data.
// Thus we don't have visibility into the stats when lower bound is NaN.
if (NaNUtil.isNaN(lower)) {
return ROWS_MIGHT_NOT_MATCH;
}

literals = literals.stream().filter(v -> ref.comparator().compare(lower, v) <= 0).collect(Collectors.toList());
if (literals.isEmpty()) { // if all values are less than lower bound, rows must match (notIn).
return ROWS_MUST_MATCH;
@@ -406,6 +433,11 @@ private boolean canContainNulls(Integer id) {
return nullCounts == null || (nullCounts.containsKey(id) && nullCounts.get(id) > 0);
}

private boolean canContainNaNs(Integer id) {
// nan counts might be null for early version writers when nan counters are not populated.
return nanCounts != null && nanCounts.containsKey(id) && nanCounts.get(id) > 0;
}

private boolean containsNullsOnly(Integer id) {
return valueCounts != null && valueCounts.containsKey(id) &&
nullCounts != null && nullCounts.containsKey(id) &&
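
All of the new guards hinge on the same ordering quirk: Java's default float/double comparison treats NaN as greater than every other value, so a file whose recorded lower bound is NaN compares as greater than (or equal to) any literal and, without the isNaN check, would be pruned even though it may hold matching non-NaN rows. A minimal standalone sketch of that ordering, using only plain java.lang calls (the class name and values below are illustrative, not taken from this PR):

public class NaNBoundSketch {
  public static void main(String[] args) {
    // Pretend ORC reported NaN as the lower bound because the first value written was NaN.
    double nanLowerBound = Double.NaN;
    double literal = 1.0; // e.g. the literal in a `col < 1.0` predicate

    // Double.compare orders NaN above every other value, so the "lower bound"
    // looks greater than the literal even though the file may contain values below it.
    int cmp = Double.compare(nanLowerBound, literal);
    System.out.println(cmp > 0); // prints: true

    // Acting on cmp alone would return ROWS_CANNOT_MATCH and skip the file,
    // which is why the evaluators in this change add an explicit NaNUtil.isNaN(lower) guard.
  }
}

Hoisting the isNaN check above the comparison, as suggested in the review, yields the same result while making the intent explicit: a NaN bound carries no usable ordering information.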