[SPARK-49991][SQL] Make HadoopMapReduceCommitProtocol respect 'mapreduce.output.basename' to generate file names
yaooqinn committed Oct 16, 2024
1 parent 861b5e9 commit 2c9909e
Showing 2 changed files with 13 additions and 1 deletion.
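For context, a minimal usage sketch (not part of this commit; the path and basename value below are illustrative): after this change, setting the Hadoop property on the write side replaces the default "part" prefix in generated file names.

// Hedged sketch: assumes a running SparkSession `spark` and write access to the demo path.
spark.conf.set("mapreduce.output.basename", "apachespark")
spark.range(10).write.parquet("/tmp/spark-basename-demo")
// Output files are then named like apachespark-00000-<jobId>... instead of part-00000-<jobId>...
spark.conf.unset("mapreduce.output.basename")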
@@ -168,7 +168,8 @@ class HadoopMapReduceCommitProtocol(
     // Note that %05d does not truncate the split number, so if we have more than 100000 tasks,
     // the file name is fine and won't overflow.
     val split = taskContext.getTaskAttemptID.getTaskID.getId
-    f"${spec.prefix}part-$split%05d-$jobId${spec.suffix}"
+    val basename = taskContext.getConfiguration.get("mapreduce.output.basename", "part")
+    f"${spec.prefix}$basename-$split%05d-$jobId${spec.suffix}"
   }

   override def setupJob(jobContext: JobContext): Unit = {
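To make the resulting names concrete, here is a standalone sketch of the format string above (illustrative values only; `basename`, `split`, and `jobId` are stand-ins, and `spec.prefix`/`spec.suffix` are omitted):

// Hypothetical values, demonstrating the f-interpolator pattern used in getFilename above.
val basename = "apachespark"   // value of mapreduce.output.basename (defaults to "part")
val split = 7                  // task ID of the task attempt
val jobId = "4f1c2d3e"         // stand-in for the real job ID
val name = f"$basename-$split%05d-$jobId"
// name == "apachespark-00007-4f1c2d3e"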
@@ -1585,6 +1585,17 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
       }
     }
   }
+
+  test("basename") {
+    withTempPath { dir =>
+      withSQLConf("mapreduce.output.basename" -> "apachespark") {
+        spark.range(1).coalesce(1).write.parquet(dir.getCanonicalPath)
+        val df = spark.read.parquet(dir.getCanonicalPath)
+        assert(df.inputFiles.head.contains("apachespark"))
+        checkAnswer(spark.read.parquet(dir.getCanonicalPath), Row(0))
+      }
+    }
+  }
 }

 class JobCommitFailureParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
