Make Statistics a case class.

apache · Jul 29, 2014 · 4ef0d26 · 4ef0d26
1 parent 3ba8f3e
commit 4ef0d26
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 14 deletions.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -26,18 +26,17 @@ import org.apache.spark.sql.catalyst.trees
 abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
   self: Product =>
 
-  // TODO: make a case class?
   /**
    * Estimates of various statistics.  The default estimation logic simply sums up the corresponding
    * statistic produced by the children.  To override this behavior, override `statistics` and
    * assign it an overridden version of `Statistics`.
    */
-  protected class Statistics {
-    lazy val childrenStats = children.map(_.statistics)
-    lazy val numTuples: Long = childrenStats.map(_.numTuples).sum
-    lazy val sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum
-  }
+  case class Statistics(
+    numTuples: Long = childrenStats.map(_.numTuples).sum,
+    sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum
+  )
   lazy val statistics: Statistics = new Statistics
+  lazy val childrenStats = children.map(_.statistics)
 
   /**
    * Returns the set of attributes that are referenced by this node

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala
@@ -81,19 +81,19 @@ case class SparkLogicalPlan(alreadyPlanned: SparkPlan)
       }).asInstanceOf[this.type]
   }
 
-  override lazy val statistics = new Statistics {
+  @transient override lazy val statistics = Statistics(
     // If this is wrapping around ExistingRdd and no reasonable estimation logic is implemented,
     // return a default value.
-    override lazy val sizeInBytes: Long = {
+    sizeInBytes = {
       val defaultSum = childrenStats.map(_.sizeInBytes).sum
       alreadyPlanned match {
         // TODO: Instead of returning a default value here, find a way to return a meaningful
-        // estimate for RDDs. See PR 1238 for more discussions.
+        // size estimate for RDDs. See PR 1238 for more discussions.
         case e: ExistingRdd if defaultSum == 0 => statsDefaultSizeInBytes
         case _ => defaultSum
       }
     }
-  }
+  )
 
 }
 

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
@@ -53,14 +53,14 @@ private[sql] case class ParquetRelation(
 
   self: Product =>
 
-  @transient override lazy val statistics = new Statistics {
+  @transient override lazy val statistics = Statistics(
     // TODO: investigate getting encoded column statistics in the parquet file?
-    override lazy val sizeInBytes: Long = {
+    sizeInBytes = {
       val hdfsPath = new Path(path)
       val fs = hdfsPath.getFileSystem(conf.getOrElse(ContextUtil.getConfiguration(new Job())))
       fs.getContentSummary(hdfsPath).getLength // TODO: in bytes or system-dependent?
     }
-  }
+  )
 
   /** Schema derived from ParquetFile */
   def parquetSchema: MessageType =

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -273,7 +273,7 @@ private[hive] case class MetastoreRelation
   @transient override lazy val statistics = new Statistics {
     // TODO: check if this estimate is valid for tables after partition pruning.
     // Size getters adapted from SizeBasedBigTableSelectorForAutoSMJ.java in Hive (version 0.13).
-    override lazy val sizeInBytes: Long =
+    override val sizeInBytes: Long =
       maybeGetSize(hiveConf, hiveQlTable.getProperty("totalSize"), path)
 
     private[this] def maybeGetSize(conf: HiveConf, size: String, path: Path): Long = {