From 85872f6e2fbb2385793b645a629ed26ee2e98cbc Mon Sep 17 00:00:00 2001
From: Michael Armbrust <michael@databricks.com>
Date: Thu, 23 Oct 2014 14:17:55 -0700
Subject: [PATCH] Allow schema calculation to be lazy, but ensure its available
 on executors.

---
 .../scala/org/apache/spark/sql/SchemaRDD.scala   | 16 ++++++++++------
 .../org/apache/spark/sql/hive/HiveContext.scala  |  7 +------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
index 3e59becb0d143..a3212ff9ec6d6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
@@ -117,14 +117,18 @@ class SchemaRDD(
 
   override def getPartitions: Array[Partition] = firstParent[Row].partitions
 
-  override protected def getDependencies: Seq[Dependency[_]] =
+  override protected def getDependencies: Seq[Dependency[_]] = {
+    schema // Force reification of the schema so it is available on executors.
+
     List(new OneToOneDependency(queryExecution.toRdd))
+  }
 
-  /** Returns the schema of this SchemaRDD (represented by a [[StructType]]).
-    *
-    * @group schema
-    */
-  val schema: StructType = queryExecution.analyzed.schema
+  /**
+   * Returns the schema of this SchemaRDD (represented by a [[StructType]]).
+   *
+   * @group schema
+   */
+  lazy val schema: StructType = queryExecution.analyzed.schema
 
   // =======================================================================
   // Query DSL
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 15cc6067ecb05..9915a17882488 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -371,11 +371,6 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
   /** Extends QueryExecution with hive specific features. */
   protected[sql] abstract class QueryExecution extends super.QueryExecution {
 
-    override lazy val toRdd: RDD[Row] = {
-      val schema = StructType.fromAttributes(logical.output)
-      executedPlan.execute().map(ScalaReflection.convertRowToScala(_, schema))
-    }
-
     protected val primitiveTypes =
       Seq(StringType, IntegerType, LongType, DoubleType, FloatType, BooleanType, ByteType,
         ShortType, DateType, TimestampType, BinaryType)
@@ -433,7 +428,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
         command.executeCollect().map(_.head.toString)
 
       case other =>
-        val result: Seq[Seq[Any]] = toRdd.collect().toSeq
+        val result: Seq[Seq[Any]] = toRdd.map(_.copy()).collect().toSeq
         // We need the types so we can output struct field names
         val types = analyzed.output.map(_.dataType)
         // Reformat to match hive tab delimited output.