diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 697f3dd051e8e..d9bd57686603f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1211,11 +1211,15 @@ abstract class RDD[T: ClassTag]( * Broadcasted copy of this RDD, used to dispatch tasks to executors. Note that we broadcast * the serialized copy of the RDD and for each task we will deserialize it, which means each * task gets a different copy of the RDD. This provides stronger isolation between tasks that - * might modify state of objects referenced in their closures. + * might modify state of objects referenced in their closures. This is necessary in Hadoop + * where the JobConf/Configuration object is not thread-safe. */ @transient private[spark] lazy val broadcasted: Broadcast[Array[Byte]] = { + // TODO: Warn users about very large RDDs. val ser = SparkEnv.get.closureSerializer.newInstance() - sc.broadcast(ser.serialize(this).array()) + val bytes = ser.serialize(this).array() + logDebug(s"Broadcasting RDD $id using ${bytes.length} bytes") + sc.broadcast(bytes) } private var storageLevel: StorageLevel = StorageLevel.NONE