apache · eatoncys · Aug 2, 2017 · Aug 2, 2017 · Aug 7, 2017 · Aug 7, 2017
diff --git a/...lyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/...lyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -355,6 +355,18 @@ class CodegenContext {
    */
   private val placeHolderToComments = new mutable.HashMap[String, String]
 
+  /**
+   * Returns if the length of codegen function is too long or not
+   * It will count the lines of every codegen function, if there is a function of length
+   * greater than spark.sql.codegen.MaxFunctionLength, it will return true.
+   */
+  def existTooLongFunction(): Boolean = {
+    classFunctions.exists { case (className, functions) =>
+      functions.exists{ case (name, code) =>
+        CodeFormatter.stripExtraNewLines(code).count(_ == '\n') > SQLConf.get.maxFunctionLength
+      }
+    }
+  }
   /**
    * Returns a term name that is unique within this instance of a `CodegenContext`.
    */

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -572,6 +572,13 @@ object SQLConf {
       "disable logging or -1 to apply no limit.")
     .createWithDefault(1000)
 
+  val WHOLESTAGE_MAX_FUNCTION_LEN = buildConf("spark.sql.codegen.MaxFunctionLength")
+    .internal()
+    .doc("The maximum number of function length that will be supported before" +
+    " deactivating whole-stage codegen.")
+    .intConf
+    .createWithDefault(1500)
+
   val FILES_MAX_PARTITION_BYTES = buildConf("spark.sql.files.maxPartitionBytes")
     .doc("The maximum number of bytes to pack into a single partition when reading files.")
     .longConf
@@ -1014,6 +1021,8 @@ class SQLConf extends Serializable with Logging {
 
   def loggingMaxLinesForCodegen: Int = getConf(CODEGEN_LOGGING_MAX_LINES)
 
+  def maxFunctionLength: Int = getConf(WHOLESTAGE_MAX_FUNCTION_LEN)
+
   def tableRelationCacheSize: Int =
     getConf(StaticSQLConf.FILESOURCE_TABLE_RELATION_CACHE_SIZE)
 

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala
@@ -370,6 +370,12 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co
 
   override def doExecute(): RDD[InternalRow] = {
     val (ctx, cleanedSource) = doCodeGen()
+    val existLongFunction = ctx.existTooLongFunction
+    if (existLongFunction) {
+      logWarning(s"Function is too long, Whole-stage codegen disabled for this plan:\n "
+        + s"$treeString")
+      return child.execute()
+    }
     // try to compile and fallback if it failed
     try {
       CodeGenerator.compile(cleanedSource)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala
@@ -301,6 +301,61 @@ class AggregateBenchmark extends BenchmarkBase {
     */
   }
 
+  ignore("max function length of wholestagecodegen") {
+    val N = 20 << 15
+
+    val benchmark = new Benchmark("max function length of wholestagecodegen", N)
+    def f(): Unit = sparkSession.range(N)
+      .selectExpr(
+        "id",
+        "(id & 1023) as k1",
+        "cast(id & 1023 as double) as k2",
+        "cast(id & 1023 as int) as k3",
+        "case when id > 100 and id <= 200 then 1 else 0 end as v1",
+        "case when id > 200 and id <= 300 then 1 else 0 end as v2",
+        "case when id > 300 and id <= 400 then 1 else 0 end as v3",
+        "case when id > 400 and id <= 500 then 1 else 0 end as v4",
+        "case when id > 500 and id <= 600 then 1 else 0 end as v5",
+        "case when id > 600 and id <= 700 then 1 else 0 end as v6",
+        "case when id > 700 and id <= 800 then 1 else 0 end as v7",
+        "case when id > 800 and id <= 900 then 1 else 0 end as v8",
+        "case when id > 900 and id <= 1000 then 1 else 0 end as v9",
+        "case when id > 1000 and id <= 1100 then 1 else 0 end as v10",
+        "case when id > 1100 and id <= 1200 then 1 else 0 end as v11",
+        "case when id > 1200 and id <= 1300 then 1 else 0 end as v12",
+        "case when id > 1300 and id <= 1400 then 1 else 0 end as v13",
+        "case when id > 1400 and id <= 1500 then 1 else 0 end as v14",
+        "case when id > 1500 and id <= 1600 then 1 else 0 end as v15",
+        "case when id > 1600 and id <= 1700 then 1 else 0 end as v16",
+        "case when id > 1700 and id <= 1800 then 1 else 0 end as v17",
+        "case when id > 1800 and id <= 1900 then 1 else 0 end as v18")
+      .groupBy("k1", "k2", "k3")
+      .sum()
+      .collect()
+
+    benchmark.addCase(s"codegen = F") { iter =>
+      sparkSession.conf.set("spark.sql.codegen.wholeStage", "false")
+      f()
+    }
+
+    benchmark.addCase(s"codegen = T") { iter =>
+      sparkSession.conf.set("spark.sql.codegen.wholeStage", "true")
+      sparkSession.conf.set("spark.sql.codegen.MaxFunctionLength", "10000")
+      f()
+    }
+
+    benchmark.run()
+
+    /*
+    Java HotSpot(TM) 64-Bit Server VM 1.8.0_111-b14 on Windows 7 6.1
+    Intel64 Family 6 Model 58 Stepping 9, GenuineIntel
+    max function length of wholestagecodegen: Best/Avg Time(ms)    Rate(M/s)   Per Row(ns) Relative
+    ----------------------------------------------------------------------------------------------
+    codegen = F                                    443 /  507          1.5         676.0     1.0X
+    codegen = T                                   3279 / 3283          0.2        5002.6     0.1X
+     */
+  }
+
 
   ignore("cube") {
     val N = 5 << 20