Don't cache the RDD broadcast variable.
rxin committed Jul 30, 2014
1 parent d256b45 commit cc152fc
Showing 6 changed files with 13 additions and 28 deletions.
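
Context for the diff below: RDD previously cached its serialized, broadcast form in a lazy val named broadcasted, so the bytes were captured once, on first use, and reused for the lifetime of the RDD. This commit replaces the lazy val with a createBroadcastBinary() method, and the DAGScheduler calls it once per stage submission. The standalone sketch below, using stand-in classes rather than Spark's own, illustrates the staleness hazard of the cached form when state referenced by the RDD (such as a Hadoop JobConf, per the scaladoc in the first hunk) is mutated after the first serialization.

    // A minimal sketch; FakeRdd and Config are stand-ins, not Spark classes.
    import java.io.{ByteArrayOutputStream, ObjectOutputStream}

    class Config(var key: String) extends Serializable

    class FakeRdd(val conf: Config) extends Serializable {
      private def serialize(): Array[Byte] = {
        val bos = new ByteArrayOutputStream()
        val oos = new ObjectOutputStream(bos)
        oos.writeObject(this)
        oos.close()
        bos.toByteArray
      }
      // Before this commit: bytes captured on first access, reused forever.
      @transient lazy val cachedBinary: Array[Byte] = serialize()
      // After: a fresh serialization on every call.
      def createBinary(): Array[Byte] = synchronized { serialize() }
    }

    object Demo extends App {
      val rdd = new FakeRdd(new Config("a"))
      val first = rdd.cachedBinary
      rdd.conf.key = "b"                   // mutate state the closure references
      assert(rdd.cachedBinary eq first)    // stale: the cached bytes never change
      assert(rdd.createBinary() ne first)  // fresh bytes per call
    }
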
core/src/main/scala/org/apache/spark/rdd/RDD.scala (2 changes: 1 addition & 1 deletion)

@@ -1225,7 +1225,7 @@ abstract class RDD[T: ClassTag](
    * might modify state of objects referenced in their closures. This is necessary in Hadoop
    * where the JobConf/Configuration object is not thread-safe.
    */
-  @transient private[spark] lazy val broadcasted: Broadcast[Array[Byte]] = {
+  @transient private[spark] def createBroadcastBinary(): Broadcast[Array[Byte]] = synchronized {
     val ser = SparkEnv.get.closureSerializer.newInstance()
     val bytes = ser.serialize(this).array()
     val size = Utils.bytesToString(bytes.length)
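
Two details of the new signature: turning the lazy val into a def removes the caching, and the explicit synchronized replaces the once-only initialization lock the lazy val used to provide, keeping concurrent serialization of the same RDD safe. A consequence, sketched below with hypothetical stand-in types, is that every call now yields a distinct broadcast, so a caller must create one handle per stage submission and share it, rather than calling the method per task.

    // FakeBroadcast and FakeRdd are hypothetical stand-ins, not the Spark API.
    final case class FakeBroadcast(id: Long, payload: Array[Byte])

    class FakeRdd {
      private var nextId = 0L
      def createBroadcastBinary(): FakeBroadcast = synchronized {
        nextId += 1
        FakeBroadcast(nextId, Array[Byte](1, 2, 3))  // re-serialized every call
      }
    }

    object Check extends App {
      val rdd = new FakeRdd
      // Two calls produce two distinct broadcasts; a scheduler must call once
      // per stage submission and pass the same handle to every task.
      assert(rdd.createBroadcastBinary().id != rdd.createBroadcastBinary().id)
    }
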

core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala (11 changes: 7 additions & 4 deletions)

@@ -694,18 +694,21 @@ class DAGScheduler(
     // Get our pending tasks and remember them in our pendingTasks entry
     stage.pendingTasks.clear()
     var tasks = ArrayBuffer[Task[_]]()
+    val broadcastRddBinary = stage.rdd.createBroadcastBinary()
     if (stage.isShuffleMap) {
       for (p <- 0 until stage.numPartitions if stage.outputLocs(p) == Nil) {
         val locs = getPreferredLocs(stage.rdd, p)
-        tasks += new ShuffleMapTask(stage.id, stage.rdd, stage.shuffleDep.get, p, locs)
+        val part = stage.rdd.partitions(p)
+        tasks += new ShuffleMapTask(stage.id, broadcastRddBinary, stage.shuffleDep.get, part, locs)
       }
     } else {
       // This is a final stage; figure out its job's missing partitions
       val job = stage.resultOfJob.get
       for (id <- 0 until job.numPartitions if !job.finished(id)) {
-        val partition = job.partitions(id)
-        val locs = getPreferredLocs(stage.rdd, partition)
-        tasks += new ResultTask(stage.id, stage.rdd, job.func, partition, locs, id)
+        val p: Int = job.partitions(id)
+        val part = stage.rdd.partitions(p)
+        val locs = getPreferredLocs(stage.rdd, p)
+        tasks += new ResultTask(stage.id, broadcastRddBinary, job.func, part, locs, id)
       }
     }
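
The scheduler side of the change: submitMissingTasks now creates the broadcast once per stage submission and hands the same handle to every task. Tasks also receive a resolved Partition object instead of an Int index, presumably because they no longer carry the live RDD from which to look up rdd.partitions(index). A simplified sketch of that construction pattern, with hypothetical types:

    // SchedulerSketch and TaskStub are hypothetical, simplified types.
    object SchedulerSketch {
      trait Partition { def index: Int }
      final case class TaskStub(stageId: Int, rddBinaryId: Long, partition: Partition)

      // One broadcast per stage submission, shared by every task in the stage.
      def submitMissingTasks(
          stageId: Int,
          createBroadcast: () => Long,      // stands in for rdd.createBroadcastBinary()
          partitions: Seq[Partition]): Seq[TaskStub] = {
        val broadcastId = createBroadcast() // called once, not once per task
        partitions.map(p => TaskStub(stageId, broadcastId, p))
      }
    }
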

core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala (10 changes: 0 additions & 10 deletions)

@@ -50,16 +50,6 @@ private[spark] class ResultTask[T, U](
   // TODO: Should we also broadcast func? For that we would need a place to
   // keep a reference to it (perhaps in DAGScheduler's job object).
 
-  def this(
-      stageId: Int,
-      rdd: RDD[T],
-      func: (TaskContext, Iterator[T]) => U,
-      partitionId: Int,
-      locs: Seq[TaskLocation],
-      outputId: Int) = {
-    this(stageId, rdd.broadcasted, func, rdd.partitions(partitionId), locs, outputId)
-  }
-
   @transient private[this] val preferredLocs: Seq[TaskLocation] = {
     if (locs == null) Nil else locs.toSet.toSeq
   }

core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala (9 changes: 0 additions & 9 deletions)

@@ -47,15 +47,6 @@ private[spark] class ShuffleMapTask(
   // TODO: Should we also broadcast the ShuffleDependency? For that we would need a place to
   // keep a reference to it (perhaps in Stage).
 
-  def this(
-      stageId: Int,
-      rdd: RDD[_],
-      dep: ShuffleDependency[_, _, _],
-      partitionId: Int,
-      locs: Seq[TaskLocation]) = {
-    this(stageId, rdd.broadcasted, dep, rdd.partitions(partitionId), locs)
-  }
-
   /** A constructor used only in test suites. This does not require passing in an RDD. */
   def this(partitionId: Int) {
     this(0, null, null, new Partition { override def index = 0 }, null)
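
Both removed auxiliary constructors existed only to bridge the old call sites: they accepted an RDD plus a partition index and reached into rdd.broadcasted themselves. With the cached lazy val gone, an equivalent bridge calling createBroadcastBinary() would serialize and broadcast the RDD once per task constructed, which is exactly the duplication the DAGScheduler change avoids. A toy sketch of the cost difference, with a counter standing in for the broadcast machinery:

    // Hypothetical cost sketch; the counter stands in for broadcast creation.
    object CostSketch extends App {
      var broadcasts = 0
      def createBroadcastBinary(): Int = { broadcasts += 1; broadcasts }

      // Old shape: a per-task bridge would trigger one broadcast per task.
      val perTask = (0 until 4).map(_ => createBroadcastBinary())
      assert(broadcasts == 4)

      // New shape: one broadcast, shared by all tasks of the submission.
      broadcasts = 0
      val shared = createBroadcastBinary()
      val tasks = (0 until 4).map(p => (shared, p))
      assert(broadcasts == 1)
    }
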

core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala (2 changes: 1 addition & 1 deletion)

@@ -146,7 +146,7 @@ class ContextCleanerSuite extends FunSuite with BeforeAndAfter with LocalSparkContext {

     // Test that GC causes broadcast task data cleanup after dereferencing the RDD.
     val postGCTester = new CleanerTester(sc,
-      broadcastIds = Seq(rdd.broadcasted.id, rdd.firstParent.broadcasted.id))
+      broadcastIds = Seq(rdd.createBroadcastBinary.id, rdd.firstParent.createBroadcastBinary.id))
     rdd = null
     runGC()
     postGCTester.assertCleanup()
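
The suite previously read broadcast ids off the cached rdd.broadcasted; each createBroadcastBinary call here instead registers a brand-new broadcast whose id the tester then watches. The test's premise is weak-reference-driven cleanup: once rdd = null drops the last strong reference and a GC runs, the tracked broadcasts become eligible for removal. A plain-JVM sketch of that mechanism (not the Spark cleaner itself):

    import java.lang.ref.WeakReference

    object GcSketch extends App {
      var payload: Array[Byte] = new Array[Byte](1 << 20)
      val weak = new WeakReference(payload)
      payload = null                   // like `rdd = null` in the suite
      System.gc()                      // like the suite's runGC() helper
      Thread.sleep(100)
      // Usually true after GC, though the JVM makes no hard guarantee.
      println(s"collected = ${weak.get == null}")
    }
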

core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala (7 changes: 4 additions & 3 deletions)

@@ -38,13 +38,14 @@ class TaskContextSuite extends FunSuite with BeforeAndAfter with LocalSparkContext {
         sys.error("failed")
       }
     }
-    val func = (c: TaskContext, i: Iterator[String]) => i.next
-    val task = new ResultTask[String, String](0, rdd, func, 0, Seq(), 0)
+    val func = (c: TaskContext, i: Iterator[String]) => i.next()
+    val task = new ResultTask[String, String](
+      0, rdd.createBroadcastBinary(), func, rdd.partitions(0), Seq(), 0)
     intercept[RuntimeException] {
       task.run(0)
     }
     assert(completed === true)
   }
 
-  case class StubPartition(val index: Int) extends Partition
+  case class StubPartition(index: Int) extends Partition
 }
