Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[SPARK-35362][SQL] Update null count in the column stats for UNION operator stats estimation #32494

Closed
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ case class FilterEstimation(plan: Filter) extends Logging {
attr: Attribute,
isNull: Boolean,
update: Boolean): Option[Double] = {
if (!colStatsMap.contains(attr) || !colStatsMap(attr).hasCountStats) {
if (!colStatsMap.contains(attr) || colStatsMap(attr).nullCount.isEmpty) {
logDebug("[CBO] No statistics for " + attr)
return None
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,20 @@ object UnionEstimation {
val outputAttrStats = new ArrayBuffer[(Attribute, ColumnStat)]()
attrToComputeMinMaxStats.foreach {
case (attrs, outputIndex) =>
var nullCount: Option[BigInt] = None
val dataType = unionOutput(outputIndex).dataType
val statComparator = createStatComparator(dataType)
val minMaxValue = attrs.zipWithIndex.foldLeft[(Option[Any], Option[Any])]((None, None)) {
case ((minVal, maxVal), (attr, childIndex)) =>
val colStat = union.children(childIndex).stats.attributeStats(attr)
// Accumulate the null count across children; a child with no null count is skipped.
nullCount = if (nullCount.isDefined && colStat.nullCount.isDefined) {
Some(nullCount.get + colStat.nullCount.get)
} else if (colStat.nullCount.isDefined) {
colStat.nullCount
} else {
nullCount
}
val min = if (minVal.isEmpty || statComparator(colStat.min.get, minVal.get)) {
colStat.min
} else {
Expand All @@ -103,10 +112,11 @@ object UnionEstimation {
}
(min, max)
}
val newStat = ColumnStat(min = minMaxValue._1, max = minMaxValue._2)
val newStat = ColumnStat(min = minMaxValue._1, max = minMaxValue._2,
nullCount = nullCount)
outputAttrStats += unionOutput(outputIndex) -> newStat
}
AttributeMap(outputAttrStats.toSeq)
AttributeMap(outputAttrStats)
} else {
AttributeMap.empty[ColumnStat]
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ class BasicStatsEstimationSuite extends PlanTest with StatsEstimationTestBase {
val rowCount = Some(plan.rowCount * childrenSize)
val attributeStats = AttributeMap(
Seq(
attribute -> ColumnStat(min = Some(1), max = Some(10))))
attribute -> ColumnStat(min = Some(1), max = Some(10), nullCount = Some(0))))
checkStats(
union,
expectedStatsCboOn = Statistics(sizeInBytes = sizeInBytes,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,14 @@ class UnionEstimationSuite extends StatsEstimationTestBase {
distinctCount = Some(2),
min = Some(1),
max = Some(4),
nullCount = Some(0),
nullCount = Some(1),
avgLen = Some(4),
maxLen = Some(4)),
attrDouble -> ColumnStat(
distinctCount = Some(2),
min = Some(5.0),
max = Some(4.0),
nullCount = Some(0),
nullCount = Some(2),
avgLen = Some(4),
maxLen = Some(4)),
attrShort -> ColumnStat(min = Some(s1), max = Some(s2)),
Expand All @@ -96,14 +96,14 @@ class UnionEstimationSuite extends StatsEstimationTestBase {
distinctCount = Some(2),
min = Some(3),
max = Some(6),
nullCount = Some(0),
nullCount = Some(1),
avgLen = Some(8),
maxLen = Some(8)),
AttributeReference("cdouble1", DoubleType)() -> ColumnStat(
distinctCount = Some(2),
min = Some(2.0),
max = Some(7.0),
nullCount = Some(0),
nullCount = Some(2),
avgLen = Some(8),
maxLen = Some(8)),
AttributeReference("cshort1", ShortType)() -> ColumnStat(min = Some(s3), max = Some(s4)),
Expand Down Expand Up @@ -139,8 +139,8 @@ class UnionEstimationSuite extends StatsEstimationTestBase {
rowCount = Some(4),
attributeStats = AttributeMap(
Seq(
attrInt -> ColumnStat(min = Some(1), max = Some(6)),
attrDouble -> ColumnStat(min = Some(2.0), max = Some(7.0)),
attrInt -> ColumnStat(min = Some(1), max = Some(6), nullCount = Some(2)),
attrDouble -> ColumnStat(min = Some(2.0), max = Some(7.0), nullCount = Some(4)),
attrShort -> ColumnStat(min = Some(s1), max = Some(s4)),
attrLong -> ColumnStat(min = Some(1L), max = Some(6L)),
attrByte -> ColumnStat(min = Some(b1), max = Some(b4)),
Expand Down