From f9d09b4ec20222b664d07bd3dda0661df187b134 Mon Sep 17 00:00:00 2001
From: Max Gekk
Date: Mon, 14 Dec 2020 15:56:46 +0900
Subject: [PATCH] [SPARK-33770][SQL][TESTS] Fix the `ALTER TABLE .. DROP
 PARTITION` tests that delete files out of partition path

### What changes were proposed in this pull request?
Modify the tests that add partitions with `LOCATION` where the number of nested folders in `LOCATION` doesn't match the number of partitioned columns. In that case, `ALTER TABLE .. DROP PARTITION` tries to access (delete) folders outside of the "base" path in `LOCATION`.

The problem lies in Hive's MetaStore method `drop_partition_common`:
https://github.com/apache/hive/blob/8696c82d07d303b6dbb69b4d443ab6f2b241b251/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveMetaStore.java#L4876
which tries to delete empty partition sub-folders recursively, starting from the deepest partition sub-folder up to the base folder. When the number of sub-folders is not equal to the number of partitioned columns `part_vals.size()`, the method will try to list and delete folders outside of the base path.
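To make the failure mode concrete, here is a minimal Scala sketch of that cleanup walk. It is an illustration under assumptions, not Hive's actual Java code, and `deleteEmptyParents` is a hypothetical name:

```scala
import java.io.File

// Sketch of the cleanup in drop_partition_common: after the partition data
// is removed, walk upwards once per partition column value
// (part_vals.size()), deleting every sub-folder that is now empty.
def deleteEmptyParents(partitionDir: File, numPartValues: Int): Unit = {
  var current = partitionDir
  var level = 0
  while (level < numPartValues && current != null &&
      Option(current.list()).exists(_.isEmpty)) {
    current.delete()                  // remove the empty sub-folder
    current = current.getParentFile   // climb one level towards the base
    level += 1
  }
}

// With two partition columns but a LOCATION only one folder deep, the walk
// deletes the location folder itself and then inspects the base folder's
// contents -- which is how the unreadable folder `aaa` from the
// reproduction below gets touched.
```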
### Why are the changes needed?
To fix test failures like https://github.com/apache/spark/pull/30643#issuecomment-743774733:
```
org.apache.spark.sql.hive.execution.command.AlterTableAddPartitionSuite.ALTER TABLE .. ADD PARTITION Hive V1: SPARK-33521: universal type conversions of partition values
sbt.ForkMain$ForkError: org.apache.spark.sql.AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: File file:/home/jenkins/workspace/SparkPullRequestBuilder/target/tmp/spark-832cb19c-65fd-41f3-ae0b-937d76c07897 does not exist;
  at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:112)
  at org.apache.spark.sql.hive.HiveExternalCatalog.dropPartitions(HiveExternalCatalog.scala:1014)
  ...
Caused by: sbt.ForkMain$ForkError: org.apache.hadoop.hive.metastore.api.MetaException: File file:/home/jenkins/workspace/SparkPullRequestBuilder/target/tmp/spark-832cb19c-65fd-41f3-ae0b-937d76c07897 does not exist
  at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.drop_partition_with_environment_context(HiveMetaStore.java:3381)
  at sun.reflect.GeneratedMethodAccessor304.invoke(Unknown Source)
```

The issue can be reproduced by the following steps:
1. Create a base folder, for example: `/Users/maximgekk/tmp/part-location`
2. Create a sub-folder in the base folder and drop permissions for it:
```
$ mkdir /Users/maximgekk/tmp/part-location/aaa
$ chmod a-rwx /Users/maximgekk/tmp/part-location/aaa
$ ls -al /Users/maximgekk/tmp/part-location
total 0
drwxr-xr-x   3 maximgekk  staff    96 Dec 13 18:42 .
drwxr-xr-x  33 maximgekk  staff  1056 Dec 13 18:32 ..
d---------   2 maximgekk  staff    64 Dec 13 18:42 aaa
```
3. Create a table with a partition folder in the base folder:
```sql
spark-sql> create table tbl (id int) partitioned by (part0 int, part1 int);
spark-sql> alter table tbl add partition (part0=1,part1=2) location '/Users/maximgekk/tmp/part-location/tbl';
```
4. Try to drop this partition:
```
spark-sql> alter table tbl drop partition (part0=1,part1=2);
20/12/13 18:46:07 ERROR HiveClientImpl:
======================
Attempt to drop the partition specs in table 'tbl' database 'default':
Map(part0 -> 1, part1 -> 2)
In this attempt, the following partitions have been dropped successfully:

The remaining partitions have not been dropped:
[1, 2]
======================

Error in query: org.apache.hadoop.hive.ql.metadata.HiveException: Error accessing file:/Users/maximgekk/tmp/part-location/aaa;
org.apache.spark.sql.AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: Error accessing file:/Users/maximgekk/tmp/part-location/aaa;
```

The command fails because it tries to access the sub-folder `aaa`, which is outside of the partition path `/Users/maximgekk/tmp/part-location/tbl`.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
By running the affected tests from local IDEA, which does not have access to folders outside of the partition paths.

Closes #30752 from MaxGekk/fix-drop-partition-location.

Lead-authored-by: Max Gekk
Co-authored-by: Maxim Gekk
Signed-off-by: HyukjinKwon
(cherry picked from commit 9160d59ae379910ca3bbd04ee25d336afff28abd)
Signed-off-by: Max Gekk
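All three test fixes below follow the same pattern: make each partition `LOCATION` exactly as many folders deep as there are partition columns, so the cleanup walk stops at the base folder. A short sketch of the pattern (the paths are illustrative, not taken from the tests):

```scala
import java.io.File

// For a table partitioned by (ds, hr), nest two sub-folders under the base
// directory so that the cleanup walk of part_vals.size() = 2 levels ends
// exactly at the base folder instead of escaping it.
val base = new File("/tmp/base")  // illustrative base path
val partDir = new File(new File(base, "ds=2008-04-09"), "hr=11")
partDir.mkdirs()
// partDir.toURI can then be used in:
//   ALTER TABLE tbl ADD PARTITION (ds='2008-04-09', hr='11')
//   LOCATION '<partDir.toURI>'
```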
---
 .../sql/catalyst/catalog/ExternalCatalogSuite.scala |  9 +++++++--
 .../org/apache/spark/sql/hive/StatisticsSuite.scala | 12 ++++++++----
 .../spark/sql/hive/execution/HiveDDLSuite.scala     |  4 ++--
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala
index b376108399c1c..40448eceb36d2 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala
@@ -385,8 +385,8 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac
       partitionColumnNames = Seq("partCol1", "partCol2"))
     catalog.createTable(table, ignoreIfExists = false)
 
-    val newLocationPart1 = newUriForDatabase()
-    val newLocationPart2 = newUriForDatabase()
+    val newLocationPart1 = newUriForPartition(Seq("p1=1", "p2=2"))
+    val newLocationPart2 = newUriForPartition(Seq("p1=3", "p2=4"))
 
     val partition1 =
       CatalogTablePartition(Map("partCol1" -> "1", "partCol2" -> "2"),
@@ -969,6 +969,11 @@ abstract class CatalogTestUtils {
 
   def newUriForDatabase(): URI = new URI(Utils.createTempDir().toURI.toString.stripSuffix("/"))
 
+  def newUriForPartition(parts: Seq[String]): URI = {
+    val path = parts.foldLeft(Utils.createTempDir())(new java.io.File(_, _))
+    new URI(path.toURI.toString.stripSuffix("/"))
+  }
+
   def newDb(name: String): CatalogDatabase = {
     CatalogDatabase(name, name + " description", newUriForDatabase(), Map.empty)
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index da79bd1e99c1e..7f15bf96dbb95 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -894,12 +894,16 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
       assert(fetched1.get.colStats.size == 2)
 
       withTempPaths(numPaths = 2) { case Seq(dir1, dir2) =>
-        val file1 = new File(dir1 + "/data")
+        val partDir1 = new File(new File(dir1, "ds=2008-04-09"), "hr=11")
+        val file1 = new File(partDir1, "data")
+        file1.getParentFile.mkdirs()
         val writer1 = new PrintWriter(file1)
         writer1.write("1,a")
         writer1.close()
 
-        val file2 = new File(dir2 + "/data")
+        val partDir2 = new File(new File(dir2, "ds=2008-04-09"), "hr=12")
+        val file2 = new File(partDir2, "data")
+        file2.getParentFile.mkdirs()
         val writer2 = new PrintWriter(file2)
         writer2.write("1,a")
         writer2.close()
@@ -908,8 +912,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
         sql(
           s"""
              |ALTER TABLE $table ADD
-             |PARTITION (ds='2008-04-09', hr='11') LOCATION '${dir1.toURI.toString}'
-             |PARTITION (ds='2008-04-09', hr='12') LOCATION '${dir2.toURI.toString}'
+             |PARTITION (ds='2008-04-09', hr='11') LOCATION '${partDir1.toURI.toString}'
+             |PARTITION (ds='2008-04-09', hr='12') LOCATION '${partDir2.toURI.toString}'
           """.stripMargin)
         if (autoUpdate) {
           val fetched2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
index 35e83ae276672..bdaef02b09eac 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -455,8 +455,8 @@ class HiveDDLSuite
     val tab = "tab_with_partitions"
     withTempDir { tmpDir =>
       val basePath = new File(tmpDir.getCanonicalPath)
-      val part1Path = new File(basePath + "/part1")
-      val part2Path = new File(basePath + "/part2")
+      val part1Path = new File(new File(basePath, "part10"), "part11")
+      val part2Path = new File(new File(basePath, "part20"), "part21")
       val dirSet = part1Path :: part2Path :: Nil
 
       // Before data insertion, all the directory are empty
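For reference, a self-contained sketch equivalent to the `newUriForPartition` helper added to `CatalogTestUtils` above; `uriForPartition` and the base path are illustrative stand-ins, since the real helper builds on `Utils.createTempDir()`:

```scala
import java.io.File
import java.net.URI

// Fold each partition spec ("col=value") into one nested sub-folder under
// the given base directory, mirroring the helper in the patch above.
def uriForPartition(base: File, parts: Seq[String]): URI = {
  val path = parts.foldLeft(base)(new File(_, _))
  new URI(path.toURI.toString.stripSuffix("/"))
}

// uriForPartition(new File("/tmp/base"), Seq("p1=1", "p2=2")) yields
// file:/tmp/base/p1=1/p2=2 -- two levels deep, one per partition column,
// so DROP PARTITION's cleanup walk never climbs above /tmp/base.
```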