[SPARK-34215][SQL] Keep tables cached after truncation

### What changes were proposed in this pull request? Invoke `CatalogImpl.refreshTable()` instead of the combination of `SessionCatalog.refreshTable()` + `uncacheQuery()`. This allows clearing cached table data while keeping the table cached. ### Why are the changes needed? 1. To improve the user experience with Spark SQL 2. To be consistent with other commands, see apache#31206 ### Does this PR introduce _any_ user-facing change? Yes. Before: ```scala scala> sql("CREATE TABLE tbl (c0 int)") res1: org.apache.spark.sql.DataFrame = [] scala> sql("INSERT INTO tbl SELECT 0") res2: org.apache.spark.sql.DataFrame = [] scala> sql("CACHE TABLE tbl") res3: org.apache.spark.sql.DataFrame = [] scala> sql("SELECT * FROM tbl").show(false) +---+ |c0 | +---+ |0 | +---+ scala> spark.catalog.isCached("tbl") res5: Boolean = true scala> sql("TRUNCATE TABLE tbl") res6: org.apache.spark.sql.DataFrame = [] scala> spark.catalog.isCached("tbl") res7: Boolean = false ``` After: ```scala scala> sql("TRUNCATE TABLE tbl") res6: org.apache.spark.sql.DataFrame = [] scala> spark.catalog.isCached("tbl") res7: Boolean = true ``` ### How was this patch tested? Added a new test to `CachedTableSuite`: ``` $ build/sbt -Phive -Phive-thriftserver "test:testOnly *CachedTableSuite" $ build/sbt -Phive -Phive-thriftserver "test:testOnly *CatalogedDDLSuite" ``` Closes apache#31308 from MaxGekk/truncate-table-cached. Authored-by: Max Gekk <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
skestle · Feb 3, 2021 · 07e1f8c · 07e1f8c
1 parent 6ec8bf1
commit 07e1f8c
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 10 deletions.
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
@@ -49,6 +49,7 @@ license: |
     * `MSCK REPAIR TABLE`
     * `LOAD DATA`
     * `REFRESH TABLE`
+    * `TRUNCATE TABLE`
     * and the method `spark.catalog.refreshTable`
   In Spark 3.1 and earlier, table refreshing leaves dependents uncached.
 

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -561,16 +561,9 @@ case class TruncateTableCommand(
         }
       }
     }
-    // After deleting the data, invalidate the table to make sure we don't keep around a stale
-    // file relation in the metastore cache.
-    spark.sessionState.refreshTable(tableName.unquotedString)
-    // Also try to drop the contents of the table from the columnar cache
-    try {
-      spark.sharedState.cacheManager.uncacheQuery(spark.table(table.identifier), cascade = true)
-    } catch {
-      case NonFatal(e) =>
-        log.warn(s"Exception when attempting to uncache table $tableIdentWithDB", e)
-    }
+    // After deleting the data, refresh the table to make sure we don't keep around a stale
+    // file relation in the metastore cache and cached table data in the cache manager.
+    spark.catalog.refreshTable(tableIdentWithDB)
 
     if (table.stats.nonEmpty) {
       // empty table after truncation

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
@@ -501,4 +501,17 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
       }
     }
   }
+
+  test("SPARK-34215: keep table cached after truncation") {
+    withTable("tbl") {
+      sql("CREATE TABLE tbl (c0 int)")
+      sql("INSERT INTO tbl SELECT 0")
+      sql("CACHE TABLE tbl")
+      assert(spark.catalog.isCached("tbl"))
+      checkAnswer(sql("SELECT * FROM tbl"), Row(0))
+      sql("TRUNCATE TABLE tbl")
+      assert(spark.catalog.isCached("tbl"))
+      checkAnswer(sql("SELECT * FROM tbl"), Seq.empty)
+    }
+  }
 }