apache · maryannxue · Jun 20, 2018 · Jun 21, 2018 · cloud-fan · Jun 21, 2018
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
@@ -26,7 +26,7 @@ import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{Dataset, SparkSession}
 import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ResolvedHint}
+import org.apache.spark.sql.catalyst.plans.logical.{AnalysisBarrier, LogicalPlan, ResolvedHint}
 import org.apache.spark.sql.execution.columnar.InMemoryRelation
 import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
 import org.apache.spark.storage.StorageLevel
@@ -97,7 +97,7 @@ class CacheManager extends Logging {
       val inMemoryRelation = InMemoryRelation(
         sparkSession.sessionState.conf.useCompression,
         sparkSession.sessionState.conf.columnBatchSize, storageLevel,
-        sparkSession.sessionState.executePlan(planToCache).executedPlan,
+        sparkSession.sessionState.executePlan(AnalysisBarrier(planToCache)).executedPlan,
         tableName,
         planToCache)
       cachedData.add(CachedData(planToCache, inMemoryRelation))
@@ -142,7 +142,7 @@ class CacheManager extends Logging {
         // Remove the cache entry before we create a new one, so that we can have a different
         // physical plan.
         it.remove()
-        val plan = spark.sessionState.executePlan(cd.plan).executedPlan
+        val plan = spark.sessionState.executePlan(AnalysisBarrier(cd.plan)).executedPlan
         val newCache = InMemoryRelation(
           cacheBuilder = cd.cachedRepresentation
             .cacheBuilder.copy(cachedPlan = plan)(_cachedColumnBuffers = null),

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala
@@ -132,4 +132,19 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext with TimeLimits
     df.unpersist()
     assert(df.storageLevel == StorageLevel.NONE)
   }
+
+  test("SPARK-24613 Cache with UDF could not be matched with subsequent dependent caches") {
+    // Count udf invocations instead of timing them: sleeping makes the test slow and flaky.
+    val udfCalls = spark.sparkContext.longAccumulator("SPARK-24613 udf calls")
+    val countingUDF = udf({x: Int => udfCalls.add(1); x})
+    val df = spark.range(0, 10).toDF("a").withColumn("b", countingUDF($"a"))
+    val df2 = df.agg(sum(df("b")))
+    df.cache()
+    df.count() // materializes df's cache, which evaluates the udf
+    val callsAfterCaching = udfCalls.value
+    df2.cache()
+    df2.collect()
+    // udf has been evaluated during caching, and thus must not be re-evaluated for df2
+    assert(udfCalls.value == callsAfterCaching)
+  }
 }