apache · liancheng · May 23, 2014 · May 23, 2014 · May 24, 2014 · rxin
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -206,17 +206,24 @@ class SQLContext(@transient val sparkContext: SparkContext)
      * final desired output requires complex expressions to be evaluated or when columns can be
      * further eliminated out after filtering has been done.
      *
+     * If defined, `prunePushedDownFilter` is tested against each filter predicate, and only the
+     * predicates for which it returns `true` stay in the filter condition; `None` keeps them all.
+     *
      * The required attributes for both filtering and expression evaluation are passed to the
      * provided `scanBuilder` function so that it can avoid unnecessary column materialization.
      */
     def pruneFilterProject(
         projectList: Seq[NamedExpression],
         filterPredicates: Seq[Expression],
+        prunePushedDownFilter: Option[Expression => Boolean],
         scanBuilder: Seq[Attribute] => SparkPlan): SparkPlan = {
 
       val projectSet = projectList.flatMap(_.references).toSet
       val filterSet = filterPredicates.flatMap(_.references).toSet
-      val filterCondition = filterPredicates.reduceLeftOption(And)
+      val filterCondition = prunePushedDownFilter
+        .map(filterPredicates.filter)
+        .getOrElse(filterPredicates)
+        .reduceLeftOption(And)
 
       // Right now we still use a projection even if the only evaluation is applying an alias
       // to a column.  Since this is a no-op, it could be avoided. However, using this

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -141,31 +141,31 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case logical.InsertIntoTable(table: ParquetRelation, partition, child, overwrite) =>
         InsertIntoParquetTable(table, planLater(child), overwrite)(sparkContext) :: Nil
       case PhysicalOperation(projectList, filters: Seq[Expression], relation: ParquetRelation) => {
-        val remainingFilters =
+        val prunePushedDownFilter =
           if (sparkContext.conf.getBoolean(ParquetFilters.PARQUET_FILTER_PUSHDOWN_ENABLED, true)) {
-            filters.filter {
-              // Note: filters cannot be pushed down to Parquet if they contain more complex
-              // expressions than simple "Attribute cmp Literal" comparisons. Here we remove
-              // all filters that have been pushed down. Note that a predicate such as
-              // "(A AND B) OR C" can result in "A OR C" being pushed down.
-              filter =>
-                val recordFilter = ParquetFilters.createFilter(filter)
-                if (!recordFilter.isDefined) {
-                  // First case: the pushdown did not result in any record filter.
-                  true
-                } else {
-                  // Second case: a record filter was created; here we are conservative in
-                  // the sense that even if "A" was pushed and we check for "A AND B" we
-                  // still want to keep "A AND B" in the higher-level filter, not just "B".
-                  !ParquetFilters.findExpression(recordFilter.get, filter).isDefined
-                }
-            }
+            // Note: filters cannot be pushed down to Parquet if they contain more complex
+            // expressions than simple "Attribute cmp Literal" comparisons. The predicate below
+            // returns true for every filter that must stay in the higher-level filter and
+            // false for filters that have been pushed down. Note that a predicate such as
+            // "(A AND B) OR C" can result in "A OR C" being pushed down.
+            Some((filter: Expression) => {
+              ParquetFilters.createFilter(filter) match {
+                // First case: the pushdown did not result in any record filter.
+                case None => true
+                // Second case: a record filter was created; here we are conservative in
+                // the sense that even if "A" was pushed and we check for "A AND B" we
+                // still want to keep "A AND B" in the higher-level filter, not just "B".
+                case Some(recordFilter) =>
+                  ParquetFilters.findExpression(recordFilter, filter).isEmpty
+              }
+            })
           } else {
-            filters
+            None
           }
         pruneFilterProject(
           projectList,
-          remainingFilters,
+          filters,
+          prunePushedDownFilter,
           ParquetTableScan(_, relation, filters)(sparkContext)) :: Nil
       }
 

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
@@ -358,5 +358,9 @@ class ParquetQuerySuite extends QueryTest with FunSuite with BeforeAndAfterAll {
     assert(stringResult(0).getString(2) == "100", "stringvalue incorrect")
     assert(stringResult(0).getInt(1) === 100)
   }
-}
 
+  test("SPARK-1913 regression: columns only referenced by pushed down filters should remain") {
+    // `myint` appears only in the WHERE clause (the pushed down filter), never in the projection.
+    assert(sql("SELECT mystring FROM testfiltersource WHERE myint < 10").collect().size === 10)
+  }
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -69,6 +69,7 @@ private[hive] trait HiveStrategies {
         pruneFilterProject(
           projectList,
           otherPredicates,
+          None,
           HiveTableScan(_, relation, pruningPredicates.reduceLeftOption(And))(hiveContext)) :: Nil
       case _ =>
         Nil