
[SPARK-23799] FilterEstimation.evaluateInSet produces division by zero in a case of empty table with analyzed statistics #20913

Closed

Changes from all commits (42 commits)
5c8fe0e
During evaluation of IN conditions, if the source table is empty, div…
Mar 26, 2018
67597fd
Added test case for the the following situation: During evaluation of…
Apr 3, 2018
989d4cc
[SPARK-23572][DOCS] Bring "security.md" up to date.
Mar 26, 2018
d49c6dd
[SPARK-23162][PYSPARK][ML] Add r2adj into Python API in LinearRegress…
kevinyu98 Mar 26, 2018
1dfa74b
[SPARK-23794][SQL] Make UUID as stateful expression
viirya Mar 27, 2018
10d6ec1
[SPARK-23096][SS] Migrate rate source to V2
jerryshao Mar 27, 2018
440549c
[SPARK-23699][PYTHON][SQL] Raise same type of error caught with Arrow…
BryanCutler Mar 28, 2018
c1b407c
[SPARK-23765][SQL] Supports custom line separator for json datasource
HyukjinKwon Mar 28, 2018
923ab78
Revert "[SPARK-23096][SS] Migrate rate source to V2"
gatorsmile Mar 28, 2018
ac4a3c3
[SPARK-23675][WEB-UI] Title add spark logo, use spark logo image
Mar 29, 2018
0826158
[SPARK-23806] Broadcast.unpersist can cause fatal exception when used…
Mar 29, 2018
96dc23b
[SPARK-23770][R] Exposes repartitionByRange in SparkR
HyukjinKwon Mar 29, 2018
b8882b8
[SPARK-23785][LAUNCHER] LauncherBackend doesn't check state of connec…
Mar 29, 2018
7c5ca63
[SPARK-23639][SQL] Obtain token before init metastore client in Spark…
yaooqinn Mar 29, 2018
132afa7
[SPARK-23808][SQL] Set default Spark session in test-only spark sessi…
jose-torres Mar 30, 2018
542f0ba
[SPARK-23743][SQL] Changed a comparison logic from containing 'slf4j'…
jongyoul Mar 30, 2018
082919f
[SPARK-23727][SQL] Support for pushing down filters for DateType in p…
yucai Mar 30, 2018
6bd199b
Roll forward "[SPARK-23096][SS] Migrate rate source to V2"
jose-torres Mar 30, 2018
96ba7af
[SPARK-23500][SQL][FOLLOWUP] Fix complex type simplification rules to…
gatorsmile Mar 30, 2018
32382fc
[SPARK-23640][CORE] Fix hadoop config may override spark config
wangyum Mar 30, 2018
14a48e3
[SPARK-23827][SS] StreamingJoinExec should ensure that input data is …
tdas Mar 30, 2018
5152b3c
[SPARK-23040][CORE][FOLLOW-UP] Avoid double wrap result Iterator.
jiangxb1987 Mar 31, 2018
1f9284a
[SPARK-15009][PYTHON][FOLLOWUP] Add default param checks for CountVec…
BryanCutler Apr 2, 2018
55f3371
[SPARK-23825][K8S] Requesting memory + memory overhead for pod memory
dvogelbacher Apr 2, 2018
f86b703
[SPARK-23285][K8S] Add a config property for specifying physical exec…
liyinan926 Apr 2, 2018
54d3dd0
[SPARK-23713][SQL] Cleanup UnsafeWriter and BufferHolder classes
kiszk Apr 2, 2018
782d7af
[SPARK-23834][TEST] Wait for connection before disconnect in Launcher…
Apr 2, 2018
2934384
[SPARK-23690][ML] Add handleinvalid to VectorAssembler
Apr 2, 2018
9b81269
[SPARK-19964][CORE] Avoid reading from remote repos in SparkSubmitSuite.
Apr 3, 2018
36a1607
[MINOR][DOC] Fix a few markdown typos
Apr 3, 2018
e29fc9c
[MINOR][CORE] Show block manager id when remove RDD/Broadcast fails.
jiangxb1987 Apr 3, 2018
4efdf84
[SPARK-23099][SS] Migrate foreach sink to DataSourceV2
jose-torres Apr 3, 2018
02bc4f6
[SPARK-23587][SQL] Add interpreted execution for MapObjects expression
viirya Apr 3, 2018
f8094fb
[SPARK-23809][SQL] Active SparkSession should be set by getOrCreate
ericl Apr 4, 2018
b55e39f
[SPARK-23802][SQL] PropagateEmptyRelation can leave query plan in unr…
Apr 4, 2018
699cc97
[SPARK-23826][TEST] TestHiveSparkSession should set default session
gatorsmile Apr 4, 2018
7561a62
[SPARK-21351][SQL] Update nullability based on children's output
maropu Apr 4, 2018
a85535b
[SPARK-23583][SQL] Invoke should support interpreted execution
kiszk Apr 4, 2018
295d11f
[SPARK-23668][K8S] Add config option for passing through k8s Pod.spec…
Apr 4, 2018
80cab07
[SPARK-23838][WEBUI] Running SQL query is displayed as "completed" in…
gengliangwang Apr 4, 2018
984faf5
[SPARK-23637][YARN] Yarn might allocate more resource if a same execu…
Apr 4, 2018
5883c3c
Merge branch 'master' into filter_estimation_devision_by_zero
Apr 10, 2018
@@ -427,7 +427,11 @@ case class FilterEstimation(plan: Filter) extends Logging {

// return the filter selectivity. Without advanced statistics such as histograms,
// we have to assume uniform distribution.
Some(math.min(newNdv.toDouble / ndv.toDouble, 1.0))
if (ndv.toDouble != 0) {
@maropu (Member), Apr 4, 2018:
What's the concrete example query when ndv.toDouble == 0?
Also, is this the only place where we need this check?
For example, don't we need it here?:

Author:
I have run into this problem with a sub-condition containing an IN clause, something like "FLD in ('value')".
As far as I can tell, this happens when the table is empty; in that case ndv will be 0.
I think it makes sense to add this check everywhere ndv is used in this way.
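For reference, the arithmetic behind the guard can be sketched in isolation (a hedged sketch mirroring the branch in the diff below; the helper name `inSetSelectivity` is ours, not Spark's):

```scala
// Sketch of the guarded selectivity computation.
// `ndv` is the column's distinct count from the analyzed statistics;
// on an empty table ANALYZE records ndv = 0, which previously caused
// a division by zero. With the guard, selectivity is reported as 0.0.
def inSetSelectivity(newNdv: BigInt, ndv: BigInt): Option[Double] =
  if (ndv.toDouble != 0) {
    Some(math.min(newNdv.toDouble / ndv.toDouble, 1.0))
  } else {
    Some(0.0)
  }
```

Clamping with `math.min(_, 1.0)` keeps the estimate a valid selectivity even when the new distinct count exceeds the old one.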

Member:

Can you add a test for the empty table case?
I think we need to fix the other places if they have the same issue. cc: @wzhfy

Author:

I have written a test that illustrates the problem; right now it is just a small standalone Scala app.
It currently fails after the last select. Adding REFRESH TABLE TBL does not fix the problem in this particular case.
Should I add it to some particular test suite?

    import org.apache.spark.sql.{SaveMode, SparkSession}

    object SparkCBOStatisticsTest {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder
          .master("local[*]")
          .config("spark.sql.adaptive.enabled", "true")
          .config("spark.sql.cbo.enabled", "true")
          .config("spark.sql.cbo.joinReorder.enabled", "true")
          .config("spark.sql.sources.bucketing.enabled", "true")
          .getOrCreate()
        import spark.implicits._
        import org.apache.spark.sql.functions._
        // Note: lit("aaa") + 'id is numeric addition, so fld3/fld4 end up null;
        // the repro only needs columns that have analyzed statistics.
        val df = spark.range(10000L).select('id,
          'id * 2 as "fld1",
          'id * 12 as "fld2",
          lit("aaa") + 'id as "fld3",
          lit("bbb") + 'id * 'id as "fld4")
        df.write
          .mode(SaveMode.Overwrite)
          .bucketBy(10, "id", "fld1", "fld2")
          .sortBy("id", "fld1", "fld2")
          .saveAsTable("TBL")
        spark.sql("ANALYZE TABLE TBL COMPUTE STATISTICS")
        spark.sql("ANALYZE TABLE TBL COMPUTE STATISTICS FOR COLUMNS ID, FLD1, FLD2, FLD3")
        val df2 = spark.sql("select t1.fld1*t2.fld2 as fld1, t1.fld2, t1.fld3 from tbl t1 join tbl t2 on t1.id=t2.id where t1.fld1>100 and t1.fld3 in (-123.23,321.23)")
        df2.explain(true)
        df2.show(100)
        df2.createTempView("tbl2")
        spark.sql("DESC FORMATTED TBL").show(false)
        spark.sql("select * from tbl2 where fld3 in ('qqq', 'qwe') and fld1<100").show(100, false)
      }
    }

Some(math.min(newNdv.toDouble / ndv.toDouble, 1.0))
} else {
Some(0.0)
}
}

/**
@@ -357,6 +357,17 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
expectedRowCount = 3)
}

test("evaluateInSet with all zeros") {
validateEstimatedStats(
Filter(InSet(attrString, Set(3, 4, 5)),
StatsTestPlan(Seq(attrString), 10,
AttributeMap(Seq(attrString ->
ColumnStat(distinctCount = Some(0), min = Some(0), max = Some(0),
nullCount = Some(0), avgLen = Some(0), maxLen = Some(0)))))),
Seq(attrString -> ColumnStat(distinctCount = Some(0))),
expectedRowCount = 0)
}

test("cint NOT IN (3, 4, 5)") {
validateEstimatedStats(
Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)),