From 89d4cc1d59d193eead9dfebd4244f29cf270eaad Mon Sep 17 00:00:00 2001 From: Bo Zhang Date: Mon, 21 Feb 2022 11:30:28 +0800 Subject: [PATCH] [SPARK-38236][SQL] Treat table location as absolute when the first letter of its path is slash in create/alter table After https://github.com/apache/spark/pull/28527, we change to create table under the database location when the table location is relative. However the criteria to determine if a table location is relative/absolute is `URI.isAbsolute`, which basically checks if the table location URI has a scheme defined. So table URIs like `/table/path` are treated as relative and the scheme and authority of the database location URI are used to create the table. For example, when the database location URI is `s3a://bucket/db`, the table will be created at `s3a://bucket/table/path`, while it should be created under the file system defined in `SessionCatalog.hadoopConf` instead. This change fixes that by treating table location as absolute when the first letter of its path is slash. This also applies to alter table. This is to fix the behavior described above. Yes. When users try to create/alter a table with a location that starts with a slash but without a scheme defined, the table will be created under/altered to the file system defined in `SessionCatalog.hadoopConf`, instead of the one defined in the database location URI. Updated unit tests. Closes #35462 from bozhang2820/spark-31709. Authored-by: Bo Zhang Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/avro/AvroSuite.scala | 2 ++ .../mllib/util/MLlibTestSparkContext.scala | 2 ++ .../sql/catalyst/catalog/SessionCatalog.scala | 2 ++ .../results/show-create-table.sql.out | 4 +-- .../v2/V2SessionCatalogSuite.scala | 25 ++++++++++++++----- 5 files changed, 27 insertions(+), 8 deletions(-) diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala index 2faa8e6e027fc..7ed9d78813527 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala +++ b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala @@ -68,6 +68,8 @@ abstract class AvroSuite override protected def beforeAll(): Unit = { super.beforeAll() + // initialize SessionCatalog here so it has a clean hadoopConf + spark.sessionState.catalog spark.conf.set(SQLConf.FILES_MAX_PARTITION_BYTES.key, 1024) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLlibTestSparkContext.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLlibTestSparkContext.scala index 5eb128abacdb9..3a7040d470c05 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLlibTestSparkContext.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLlibTestSparkContext.scala @@ -40,6 +40,8 @@ trait MLlibTestSparkContext extends TempDirectory { self: Suite => .appName("MLlibUnitTest") .getOrCreate() sc = spark.sparkContext + // initialize SessionCatalog here so it has a clean hadoopConf + spark.sessionState.catalog checkpointDir = Utils.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString sc.setCheckpointDir(checkpointDir) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index a4694a600e5a5..314997a72efc8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -393,6 +393,8 @@ class SessionCatalog( private def makeQualifiedTablePath(locationUri: URI, database: String): URI = { if (locationUri.isAbsolute) { locationUri + } else if (new Path(locationUri).isAbsolute) { + makeQualifiedPath(locationUri) } else { val dbName = formatDatabaseName(database) val dbLocation = makeQualifiedDBPath(getDatabaseMetadata(dbName).locationUri) diff --git a/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out b/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out index 88fef8908638e..e72d211314695 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out @@ -80,7 +80,7 @@ CREATE TABLE `default`.`tbl` ( `b` STRING, `c` INT) USING parquet -LOCATION 'file:/path/to/table' +LOCATION 'file:///path/to/table' -- !query @@ -110,7 +110,7 @@ CREATE TABLE `default`.`tbl` ( `b` STRING, `c` INT) USING parquet -LOCATION 'file:/path/to/table' +LOCATION 'file:///path/to/table' -- !query diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala index 1a4f08418f8d3..d6895187cd622 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala @@ -29,7 +29,7 @@ import org.scalatest.BeforeAndAfter import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{NamespaceAlreadyExistsException, NoSuchNamespaceException, NoSuchTableException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Identifier, NamespaceChange, TableCatalog, TableChange, V1Table} +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Identifier, NamespaceChange, SupportsNamespaces, TableCatalog, TableChange, V1Table} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, StringType, StructField, StructType, TimestampType} import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -60,7 +60,8 @@ class V2SessionCatalogTableSuite extends V2SessionCatalogBaseSuite { super.beforeAll() val catalog = newCatalog() catalog.createNamespace(Array("db"), emptyProps) - catalog.createNamespace(Array("db2"), emptyProps) + catalog.createNamespace(Array("db2"), + Map(SupportsNamespaces.PROP_LOCATION -> "file:///db2.db").asJava) catalog.createNamespace(Array("ns"), emptyProps) catalog.createNamespace(Array("ns2"), emptyProps) } @@ -186,10 +187,17 @@ class V2SessionCatalogTableSuite extends V2SessionCatalogBaseSuite { assert(t2.catalogTable.location === makeQualifiedPathWithWarehouse("db.db/relative/path")) catalog.dropTable(testIdent) - // absolute path + // absolute path without scheme properties.put(TableCatalog.PROP_LOCATION, "/absolute/path") val t3 = catalog.createTable(testIdent, schema, Array.empty, properties).asInstanceOf[V1Table] - assert(t3.catalogTable.location.toString === "file:/absolute/path") + assert(t3.catalogTable.location.toString === "file:///absolute/path") + catalog.dropTable(testIdent) + + // absolute path with scheme + properties.put(TableCatalog.PROP_LOCATION, "file:/absolute/path") + val t4 = catalog.createTable(testIdent, schema, Array.empty, properties).asInstanceOf[V1Table] + assert(t4.catalogTable.location.toString === "file:/absolute/path") + catalog.dropTable(testIdent) } test("tableExists") { @@ -686,10 +694,15 @@ class V2SessionCatalogTableSuite extends V2SessionCatalogBaseSuite { TableChange.setProperty(TableCatalog.PROP_LOCATION, "relative/path")).asInstanceOf[V1Table] assert(t2.catalogTable.location === makeQualifiedPathWithWarehouse("db.db/relative/path")) - // absolute path + // absolute path without scheme val t3 = catalog.alterTable(testIdent, TableChange.setProperty(TableCatalog.PROP_LOCATION, "/absolute/path")).asInstanceOf[V1Table] - assert(t3.catalogTable.location.toString === "file:/absolute/path") + assert(t3.catalogTable.location.toString === "file:///absolute/path") + + // absolute path with scheme + val t4 = catalog.alterTable(testIdent, TableChange.setProperty( + TableCatalog.PROP_LOCATION, "file:/absolute/path")).asInstanceOf[V1Table] + assert(t4.catalogTable.location.toString === "file:/absolute/path") } test("dropTable") {