[SPARK-26873][SQL] Use a consistent timestamp to build Hadoop Job IDs.
## What changes were proposed in this pull request?

Updates FileFormatWriter to build one consistent Hadoop Job ID for an entire write. Previously, `executeTask` constructed the job ID from `new Date` inside each task, so tasks belonging to the same write could compute different IDs; the timestamp is now captured once on the driver and passed to every task.
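
For context, Hadoop job IDs embed a "jobtracker ID" string derived from a timestamp. The standalone sketch below is not Spark's code: the object name is hypothetical, and the `yyyyMMddHHmmss` format is an assumption meant to mirror the helper behind `SparkHadoopWriterUtils.createJobID`. It shows why per-task `new Date()` calls can disagree while a single driver-side instant cannot:

```scala
// Standalone sketch, not code from this patch. The timestamp format is an
// assumption mirroring the helper behind SparkHadoopWriterUtils.createJobID.
import java.text.SimpleDateFormat
import java.util.{Date, Locale}

object JobIdInstantSketch {
  // Hadoop job IDs embed a "jobtracker ID" derived from a timestamp.
  def jobTrackerId(time: Date): String =
    new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(time)

  def main(args: Array[String]): Unit = {
    // Before the patch: each task evaluated `new Date()` on its own, so two
    // tasks of the same write could cross a second boundary and disagree.
    val fromTaskA = jobTrackerId(new Date())
    Thread.sleep(1500) // simulate a task that starts slightly later
    val fromTaskB = jobTrackerId(new Date())
    println(s"per-task timestamps: $fromTaskA vs $fromTaskB") // may differ

    // After the patch: the driver captures one instant, and every task
    // derives its job ID from that same instant.
    val jobIdInstant = new Date().getTime
    println(jobTrackerId(new Date(jobIdInstant))) // identical in every task
  }
}
```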

## How was this patch tested?

Existing tests, run to check for regressions; only FileFormatWriter changed, so no new tests were added.

Closes apache#23777 from rdblue/SPARK-26873-fix-file-format-writer-job-ids.

Authored-by: Ryan Blue <[email protected]>
Signed-off-by: Marcelo Vanzin <[email protected]>
rdblue authored and Marcelo Vanzin committed Feb 14, 2019
1 parent 2228ee5 commit 33334e2
Showing 1 changed file with 4 additions and 1 deletion.
FileFormatWriter.scala

```diff
@@ -162,12 +162,14 @@ object FileFormatWriter extends Logging {
       rdd
     }
 
+    val jobIdInstant = new Date().getTime
     val ret = new Array[WriteTaskResult](rddWithNonEmptyPartitions.partitions.length)
     sparkSession.sparkContext.runJob(
       rddWithNonEmptyPartitions,
       (taskContext: TaskContext, iter: Iterator[InternalRow]) => {
         executeTask(
           description = description,
+          jobIdInstant = jobIdInstant,
           sparkStageId = taskContext.stageId(),
           sparkPartitionId = taskContext.partitionId(),
           sparkAttemptNumber = taskContext.taskAttemptId().toInt & Integer.MAX_VALUE,
@@ -200,13 +202,14 @@ object FileFormatWriter extends Logging {
   /** Writes data out in a single Spark task. */
   private def executeTask(
       description: WriteJobDescription,
+      jobIdInstant: Long,
       sparkStageId: Int,
       sparkPartitionId: Int,
       sparkAttemptNumber: Int,
       committer: FileCommitProtocol,
       iterator: Iterator[InternalRow]): WriteTaskResult = {
 
-    val jobId = SparkHadoopWriterUtils.createJobID(new Date, sparkStageId)
+    val jobId = SparkHadoopWriterUtils.createJobID(new Date(jobIdInstant), sparkStageId)
     val taskId = new TaskID(jobId, TaskType.MAP, sparkPartitionId)
     val taskAttemptId = new TaskAttemptID(taskId, sparkAttemptNumber)
```
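
The fix relies on an ordinary closure-capture pattern: a value computed once on the driver is serialized into the task closure, so every executor sees the same value. Below is a minimal, self-contained illustration of that pattern; the app name and master are placeholders, and this is not code from the patch itself:

```scala
// Hypothetical demo of the driver-side capture pattern used by the patch.
import java.util.Date

import org.apache.spark.sql.SparkSession

object ConsistentInstantDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[4]")
      .appName("consistent-instant-demo")
      .getOrCreate()

    val jobIdInstant = new Date().getTime // evaluated once, on the driver

    // The lambda captures jobIdInstant, so every task reconstructs the same
    // Date instead of each task calling new Date() when it starts.
    val stamps = spark.sparkContext
      .parallelize(1 to 4, numSlices = 4)
      .map(_ => new Date(jobIdInstant).getTime)
      .collect()

    assert(stamps.distinct.length == 1, "all tasks saw the same instant")
    spark.stop()
  }
}
```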
