Skip to content

Commit

Permalink
Make Statistics a case class.
Browse files Browse the repository at this point in the history
  • Loading branch information
concretevitamin committed Jul 29, 2014
1 parent 3ba8f3e commit 4ef0d26
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,17 @@ import org.apache.spark.sql.catalyst.trees
abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
self: Product =>

// TODO: make a case class?
/**
* Estimates of various statistics. The default estimation logic simply sums up the corresponding
* statistic produced by the children. To override this behavior, override `statistics` and
* assign it an overridden version of `Statistics`.
*/
protected class Statistics {
lazy val childrenStats = children.map(_.statistics)
lazy val numTuples: Long = childrenStats.map(_.numTuples).sum
lazy val sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum
}
case class Statistics(
numTuples: Long = childrenStats.map(_.numTuples).sum,
sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum
)
lazy val statistics: Statistics = new Statistics
lazy val childrenStats = children.map(_.statistics)

/**
* Returns the set of attributes that are referenced by this node
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,19 +81,19 @@ case class SparkLogicalPlan(alreadyPlanned: SparkPlan)
}).asInstanceOf[this.type]
}

override lazy val statistics = new Statistics {
@transient override lazy val statistics = Statistics(
// If this is wrapping around ExistingRdd and no reasonable estimation logic is implemented,
// return a default value.
override lazy val sizeInBytes: Long = {
sizeInBytes = {
val defaultSum = childrenStats.map(_.sizeInBytes).sum
alreadyPlanned match {
// TODO: Instead of returning a default value here, find a way to return a meaningful
// estimate for RDDs. See PR 1238 for more discussions.
// size estimate for RDDs. See PR 1238 for more discussions.
case e: ExistingRdd if defaultSum == 0 => statsDefaultSizeInBytes
case _ => defaultSum
}
}
}
)

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,14 @@ private[sql] case class ParquetRelation(

self: Product =>

@transient override lazy val statistics = new Statistics {
@transient override lazy val statistics = Statistics(
// TODO: investigate getting encoded column statistics in the parquet file?
override lazy val sizeInBytes: Long = {
sizeInBytes = {
val hdfsPath = new Path(path)
val fs = hdfsPath.getFileSystem(conf.getOrElse(ContextUtil.getConfiguration(new Job())))
fs.getContentSummary(hdfsPath).getLength // TODO: in bytes or system-dependent?
}
}
)

/** Schema derived from ParquetFile */
def parquetSchema: MessageType =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ private[hive] case class MetastoreRelation
@transient override lazy val statistics = new Statistics {
// TODO: check if this estimate is valid for tables after partition pruning.
// Size getters adapted from SizeBasedBigTableSelectorForAutoSMJ.java in Hive (version 0.13).
override lazy val sizeInBytes: Long =
override val sizeInBytes: Long =
maybeGetSize(hiveConf, hiveQlTable.getProperty("totalSize"), path)

private[this] def maybeGetSize(conf: HiveConf, size: String, path: Path): Long = {
Expand Down

0 comments on commit 4ef0d26

Please sign in to comment.