[SPARK-31973][SQL] Skip partial aggregates if grouping keys have high cardinality #28804
@@ -2196,6 +2196,25 @@ object SQLConf {
      .checkValue(bit => bit >= 10 && bit <= 30, "The bit value must be in [10, 30].")
      .createWithDefault(16)

  val SKIP_PARTIAL_AGGREGATE_ENABLED =
    buildConf("spark.sql.aggregate.partialaggregate.skip.enabled")
      .internal()
      .doc("Avoid sort/spill to disk during partial aggregation")
      .booleanConf
      .createWithDefault(true)
Review thread:
- Could we use a threshold + column stats instead of this boolean config?
- I didn't get the threshold part. Can you please elaborate?
- That meant a ratio of the distinct row count to the total row count from the group-by key column stats. For example, if the number of distinct values is close to the total row count, partial aggregation is unlikely to reduce the data.
- Thanks @maropu for explaining, I will make this change.
- @maropu This is a very useful suggestion. One issue is that column stats are rarely computed. We came across this work in Hive: https://issues.apache.org/jira/browse/HIVE-291. They turn off the map-side aggregate (i.e., the partial aggregate becomes a pass-through) in the physical Group-By operator if map-side aggregation does not reduce the entries by at least half, and they look at 100000 rows to decide that (ref: patch https://issues.apache.org/jira/secure/attachment/12400257/291.1.txt). Should we do something similar in HashAggregateExec here? Any thoughts on this?
- I think whether that approach improves performance depends on IO performance, but the idea looks interesting to me. WDYT? @cloud-fan
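A minimal sketch of the Hive-style heuristic described above, written with hypothetical counter names rather than anything from this PR: look at a fixed number of input rows first, then skip map-side aggregation only if it failed to shrink them enough.

// Sketch only; `rowsSeen` and `groupsInHashMap` are assumed counters, not PR code.
def shouldSkipPartialAggregation(
    rowsSeen: Long,
    groupsInHashMap: Long,
    threshold: Long = 100000L, // cf. spark.sql.aggregate.partialaggregate.skip.threshold
    ratio: Double = 0.5        // cf. spark.sql.aggregate.partialaggregate.skip.ratio
  ): Boolean = {
  // Only decide once enough rows have been observed, and skip when the hash map
  // holds almost as many groups as rows were consumed (i.e., poor reduction).
  rowsSeen >= threshold && groupsInHashMap.toDouble / rowsSeen > ratio
}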
  val SKIP_PARTIAL_AGGREGATE_THRESHOLD =
    buildConf("spark.sql.aggregate.partialaggregate.skip.threshold")
      .internal()
      .longConf
      .createWithDefault(100000)
Review thread:
- I think it must be data-dependent. If I have 10^10 records and partial aggregation cuts them down to 10^8 (1% of the original input), it is still worth doing the partial aggregation.
- @cloud-fan We skip partial aggregation only when the aggregation was not able to cut down the records by 50% (defined by spark.sql.aggregate.partialaggregate.skip.ratio). In this case it will not kick in.
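To make that example concrete (a quick check, not part of the PR): reducing 10^10 rows to 10^8 groups is a reduction ratio of 10^8 / 10^10 = 0.01, far below the default 0.5, so the skip would not trigger and the partial aggregate would be kept in that scenario.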
  val SKIP_PARTIAL_AGGREGATE_RATIO =
    buildConf("spark.sql.aggregate.partialaggregate.skip.ratio")
      .internal()
      .doubleConf
      .createWithDefault(0.5)

Review thread:
- Could you describe this in more detail?
- Updated. Can you please check whether it is explanatory?
Review thread:
- Why do we need two params for this optimization?
- Also, could you check the performance numbers by varying the params?
- @maropu I have borrowed this heuristic from Hive. We can merge them into one. Any suggestions here?
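For reference, a usage sketch of the three configurations added in this diff (illustrative only; the values shown are simply the defaults from the diff):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Internal flags introduced above; names and defaults are taken from this diff.
spark.conf.set("spark.sql.aggregate.partialaggregate.skip.enabled", "true")
spark.conf.set("spark.sql.aggregate.partialaggregate.skip.threshold", "100000")
spark.conf.set("spark.sql.aggregate.partialaggregate.skip.ratio", "0.5")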
  val AVRO_COMPRESSION_CODEC = buildConf("spark.sql.avro.compression.codec")
    .doc("Compression codec used in writing of AVRO files. Supported codecs: " +
      "uncompressed, deflate, snappy, bzip2 and xz. Default codec is snappy.")

@@ -2922,6 +2941,12 @@ class SQLConf extends Serializable with Logging {

  def fastHashAggregateRowMaxCapacityBit: Int = getConf(FAST_HASH_AGGREGATE_MAX_ROWS_CAPACITY_BIT)

  def skipPartialAggregate: Boolean = getConf(SKIP_PARTIAL_AGGREGATE_ENABLED)

  def skipPartialAggregateThreshold: Long = getConf(SKIP_PARTIAL_AGGREGATE_THRESHOLD)

  def skipPartialAggregateRatio: Double = getConf(SKIP_PARTIAL_AGGREGATE_RATIO)

  def datetimeJava8ApiEnabled: Boolean = getConf(DATETIME_JAVA8API_ENABLED)

  def uiExplainMode: String = getConf(UI_EXPLAIN_MODE)
@@ -353,4 +353,8 @@ object AggUtils {

    finalAndCompleteAggregate :: Nil
  }

  def areAggExpressionsPartial(modes: Seq[AggregateMode]): Boolean = {
    modes.nonEmpty && modes.forall(_ == Partial)
Review thread:
- We cannot apply this optimization in the empty case? e.g., an aggregate with grouping keys but no aggregate functions.
- In this case the reducer side also does not have any aggregate function, and we might end up not aggregating the data.
  }
}
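As a hedged illustration of the empty-modes case raised in the thread above (the concrete example is an assumption, not taken from the review): a DISTINCT is planned as an aggregate with grouping keys but no aggregate functions, so modes is empty and the modes.nonEmpty guard keeps such plans out of the skip-partial-aggregate path.

// Assumed example, reusing a SparkSession named `spark`:
// DISTINCT becomes a HashAggregate with keys only, so areAggExpressionsPartial(Nil) == false.
val deduplicated = spark.range(100).selectExpr("id % 3 AS key").distinct()
deduplicated.explain() // HashAggregate nodes with grouping keys and no aggregate functions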
@@ -63,6 +63,8 @@ case class HashAggregateExec(

  require(HashAggregateExec.supportsAggregate(aggregateBufferAttributes))

  override def needStopCheck: Boolean = skipPartialAggregate
Review comment: Why do we need to add stopCheck to child operators after this change?
  override lazy val allAttributes: AttributeSeq =
    child.output ++ aggregateBufferAttributes ++ aggregateAttributes ++
      aggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes)

@@ -72,6 +74,8 @@ case class HashAggregateExec(
    "peakMemory" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory"),
    "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size"),
    "aggTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in aggregation build"),
    "partialAggSkipped" -> SQLMetrics.createMetric(sparkContext,
      "number of skipped records for partial aggregates"),
Review comment: I think this metric is only meaningful for aggregates with a partial mode, so could we show it only in that mode?
    "avgHashProbe" ->
      SQLMetrics.createAverageMetric(sparkContext, "avg hash probe bucket list iters"))

@@ -409,6 +413,12 @@ case class HashAggregateExec(
  private var fastHashMapTerm: String = _
  private var isFastHashMapEnabled: Boolean = false

  private var avoidSpillInPartialAggregateTerm: String = _
  private val skipPartialAggregate = sqlContext.conf.skipPartialAggregate &&
    AggUtils.areAggExpressionsPartial(modes) && find(_.isInstanceOf[ExpandExec]).isEmpty
Review thread:
- Why do we need this check?
- This is required to avoid the optimization for queries with more than one distinct aggregate, which are planned with an Expand node. Say the dataset contains two records with the same grouping key. With my change partial aggregation is skipped, so the rows produced by Expand reach the reducer without being combined, and the reducer-side aggregation result becomes (key1, 2, 2, 2) instead of the correct counts. Hence the check for the presence of an Expand node, to avoid skipping partial aggregation in that case.
- Hm, I see... But I think the current approach looks dangerous, because we might add a new plan with the same assumption in the future.
- True. Any suggestions on handling this in a better way?
- I have no smart idea for that now. Doesn't Hive have that kind of plan?
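A hedged example of the kind of query in question (the query text is an assumption, not taken from the review): two distinct aggregates over the same grouping key force an Expand node into the plan, and the partial aggregate below it is what collapses the rows Expand duplicates.

// Assumed illustration, reusing a SparkSession named `spark`; explain() shows the Expand node.
spark.sql(
  """SELECT key, COUNT(DISTINCT a), COUNT(DISTINCT b)
    |FROM VALUES ('key1', 1, 2), ('key1', 1, 2) AS t(key, a, b)
    |GROUP BY key""".stripMargin).explain()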
  private var rowCountTerm: String = _
  private var outputFunc: String = _

  // whether a vectorized hashmap is used instead
  // we have decided to always use the row-based hashmap,
  // but the vectorized hashmap can still be switched on for testing and benchmarking purposes.

@@ -628,6 +638,8 @@ case class HashAggregateExec(
         |${consume(ctx, resultVars)}
       """.stripMargin
    }

    ctx.addNewFunction(funcName,
      s"""
         |private void $funcName(UnsafeRow $keyTerm, UnsafeRow $bufferTerm)

@@ -680,6 +692,10 @@ case class HashAggregateExec(

  private def doProduceWithKeys(ctx: CodegenContext): String = {
Review thread:
- Why did you apply this optimization only to the with-key case?
- There will be only one key for the map in the without-key case, so the optimization does not apply.
- Ah, I see.
    val initAgg = ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "initAgg")
    avoidSpillInPartialAggregateTerm = ctx.
      addMutableState(CodeGenerator.JAVA_BOOLEAN, "avoidPartialAggregate")
    val childrenConsumed = ctx.
      addMutableState(CodeGenerator.JAVA_BOOLEAN, "childrenConsumed")
    if (sqlContext.conf.enableTwoLevelAggMap) {
      enableTwoLevelHashMap(ctx)
    } else if (sqlContext.conf.enableVectorizedHashMap) {

@@ -750,18 +766,19 @@ case class HashAggregateExec(
      finishRegularHashMap
    }

    outputFunc = generateResultFunction(ctx)
    val doAggFuncName = ctx.addNewFunction(doAgg,
      s"""
         |private void $doAgg() throws java.io.IOException {
         |  ${child.asInstanceOf[CodegenSupport].produce(ctx, this)}
         |  $childrenConsumed = true;
Review thread:
- Why do we need this variable?
- Before my change, all the records of the dataset were processed and the hash map populated (plus spilled to the sorter) at once. With this change the operator can start emitting rows before its input is fully consumed, so the flag records whether the child's records have all been consumed; the produce code below re-invokes doAgg() when they have not.
         |  $finishHashMap
         |}
      """.stripMargin)

    // generate code for output
    val keyTerm = ctx.freshName("aggKey")
    val bufferTerm = ctx.freshName("aggBuffer")
    val outputFunc = generateResultFunction(ctx)
Review comment: Why did you move this line up to line 771?
    def outputFromFastHashMap: String = {
      if (isFastHashMapEnabled) {

@@ -833,11 +850,18 @@ case class HashAggregateExec(
    s"""
       |if (!$initAgg) {
       |  $initAgg = true;
       |  $avoidSpillInPartialAggregateTerm =
       |    ${Utils.isTesting} && $skipPartialAggregate;
       |  $createFastHashMap
       |  $hashMapTerm = $thisPlan.createHashMap();
       |  long $beforeAgg = System.nanoTime();
       |  $doAggFuncName();
       |  $aggTime.add((System.nanoTime() - $beforeAgg) / $NANOS_PER_MILLIS);
       |  $shouldStopCheckCode;
       |}
       |if (!$childrenConsumed) {
       |  $doAggFuncName();
       |  $shouldStopCheckCode;
       |}
       |// output the result
       |$outputFromFastHashMap
@@ -877,44 +901,61 @@ case class HashAggregateExec(
      ("true", "true", "", "")
    }

    val skipPartialAggregateThreshold = sqlContext.conf.skipPartialAggregateThreshold
    val skipPartialAggRatio = sqlContext.conf.skipPartialAggregateRatio
    val oomeClassName = classOf[SparkOutOfMemoryError].getName
    val countTerm = ctx.addMutableState(CodeGenerator.JAVA_LONG, "count")
    val findOrInsertRegularHashMap: String =
      s"""
         |if (!$avoidSpillInPartialAggregateTerm) {
         |  // generate grouping key
         |  ${unsafeRowKeyCode.code}
         |  int $unsafeRowKeyHash = ${unsafeRowKeyCode.value}.hashCode();
         |  if ($checkFallbackForBytesToBytesMap) {
         |    // try to get the buffer from hash map
         |    $unsafeRowBuffer =
         |      $hashMapTerm.getAggregationBufferFromUnsafeRow($unsafeRowKeys, $unsafeRowKeyHash);
         |  }
         |  // Can't allocate buffer from the hash map. Spill the map and fallback to sort-based
         |  // aggregation after processing all input rows.
         |  if ($unsafeRowBuffer == null && !$avoidSpillInPartialAggregateTerm) {
Review comment: Is this check redundant? We are already checking for !$avoidSpillInPartialAggregateTerm in the enclosing if.
         |    // If sort/spill to disk is disabled, nothing is done.
         |    // Aggregation buffer is created later
         |    $countTerm = $countTerm + $hashMapTerm.getNumRows();
         |    boolean skipPartAgg =
         |      !($rowCountTerm < $skipPartialAggregateThreshold) &&
         |      ($countTerm/$rowCountTerm) > $skipPartialAggRatio;
         |    if ($skipPartialAggregate && skipPartAgg) {
         |      $avoidSpillInPartialAggregateTerm = true;
         |    } else {
         |      if ($sorterTerm == null) {
         |        $sorterTerm = $hashMapTerm.destructAndCreateExternalSorter();
         |      } else {
         |        $sorterTerm.merge($hashMapTerm.destructAndCreateExternalSorter());
         |      }
         |      $resetCounter
         |      // the hash map had be spilled, it should have enough memory now,
         |      // try to allocate buffer again.
         |      $unsafeRowBuffer = $hashMapTerm.getAggregationBufferFromUnsafeRow(
         |        $unsafeRowKeys, $unsafeRowKeyHash);
         |      if ($unsafeRowBuffer == null) {
         |        // failed to allocate the first page
         |        throw new $oomeClassName("No enough memory for aggregation");
         |      }
         |    }
         |  }
         |}
      """.stripMargin

    val partTerm = metricTerm(ctx, "partialAggSkipped")

    val findOrInsertHashMap: String = {
      val insertCode = if (isFastHashMapEnabled) {
        // If fast hash map is on, we first generate code to probe and update the fast hash map.
        // If the probe is successful the corresponding fast row buffer will hold the mutable row.
        s"""
           |if ($checkFallbackForGeneratedHashMap && !$avoidSpillInPartialAggregateTerm) {
           |  ${fastRowKeys.map(_.code).mkString("\n")}
           |  if (${fastRowKeys.map("!" + _.isNull).mkString(" && ")}) {
           |    $fastRowBuffer = $fastHashMapTerm.findOrInsert(
@@ -923,12 +964,25 @@ case class HashAggregateExec(
           |}
           |// Cannot find the key in fast hash map, try regular hash map.
           |if ($fastRowBuffer == null) {
           |  $countTerm = $countTerm + $fastHashMapTerm.getNumRows();
           |  $findOrInsertRegularHashMap
           |}
         """.stripMargin
      } else {
        findOrInsertRegularHashMap
      }
      val initExpr = declFunctions.flatMap(f => f.initialValues)
      val emptyBufferKeyCode = GenerateUnsafeProjection.createCode(ctx, initExpr)
      s"""
         |$insertCode
         |// Create an empty aggregation buffer
         |if ($avoidSpillInPartialAggregateTerm) {
         |  ${unsafeRowKeyCode.code}
         |  ${emptyBufferKeyCode.code}
         |  $unsafeRowBuffer = ${emptyBufferKeyCode.value};
         |  $partTerm.add(1);
         |}
         |""".stripMargin
    }

    val inputAttr = aggregateBufferAttributes ++ inputAttributes

@@ -1005,7 +1059,7 @@ case class HashAggregateExec(
    }

    val updateRowInHashMap: String = {
      val updateRowinMap = if (isFastHashMapEnabled) {
        if (isVectorizedHashMapEnabled) {
          ctx.INPUT_ROW = fastRowBuffer
          val boundUpdateExprs = updateExprs.map { updateExprsForOneFunc =>

@@ -1080,6 +1134,12 @@ case class HashAggregateExec(
      } else {
        updateRowInRegularHashMap
      }
      s"""
         |$updateRowinMap
         |if ($avoidSpillInPartialAggregateTerm) {
         |  $outputFunc(${unsafeRowKeyCode.value}, $unsafeRowBuffer);
         |}
         |""".stripMargin
    }

    val declareRowBuffer: String = if (isFastHashMapEnabled) {
Review thread:
- So this only works for hash aggregate but not for sort-based aggregate?
- I believe this heuristic can be applied to sort-based aggregation as well. I started with hash-based aggregate; I will create a new PR for sort-based aggregation.