Skip to content

Commit

Permalink
Use defined timezone on write for formats that need TZ info (#665)
Browse files Browse the repository at this point in the history
* Use defined timezone on write for formats that need TZ info

* Add test
  • Loading branch information
srowen authored Oct 10, 2023
1 parent b2611bd commit 6969262
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 4 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ Defaults to `false`. New in 0.11.0.
columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
By default several formats are tried, including [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT)
and variations with offset timezones or no timezone (defaults to UTC). New in 0.12.0. As of 0.16.0, if a custom format pattern is used without a timezone, the default Spark timezone specified by `spark.sql.session.timeZone` will be used.
* `timezone`: Identifier of the timezone (for example `UTC` or `America/New_York`) to be used when reading timestamps without a timezone specified. New in 0.16.0.
* `dateFormat`: Specifies an additional date format that will be tried when parsing values as `DateType`
columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0.
Expand All @@ -86,6 +87,7 @@ When writing files the API accepts several options:
* `timestampFormat`: Controls the format used to write `TimestampType` columns.
The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
Defaults to [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT). New in 0.12.0. As of 0.16.0, if a custom format pattern is used without a timezone, the default Spark timezone specified by `spark.sql.session.timeZone` will be used.
* `timezone`: Identifier of the timezone (for example `UTC` or `America/New_York`) to be used when writing timestamps without a timezone specified. New in 0.16.0.
* `dateFormat`: Controls the format used to write `DateType` columns.
The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package com.databricks.spark.xml.parsers

import java.sql.{Date, Timestamp}
import java.time.ZoneId
import java.time.format.DateTimeFormatter

import javax.xml.stream.XMLStreamWriter
Expand Down Expand Up @@ -85,7 +86,8 @@ private[xml] object StaxXmlGenerator {
case (StringType, v: String) => writer.writeCharacters(v)
case (TimestampType, v: Timestamp) =>
val formatter = options.timestampFormat.map(DateTimeFormatter.ofPattern).
getOrElse(DateTimeFormatter.ISO_INSTANT)
getOrElse(DateTimeFormatter.ISO_INSTANT).
withZone(options.timezone.map(ZoneId.of).orNull)
writer.writeCharacters(formatter.format(v.toInstant()))
case (DateType, v: Date) =>
val formatter = options.dateFormat.map(DateTimeFormatter.ofPattern).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ case class KnownData(
stringDatum: String,
timeDatum: String,
timestampDatum: Timestamp,
noTZTimestampDatum: Timestamp,
nullDatum: Null
)

Expand Down Expand Up @@ -78,6 +79,7 @@ final class StaxXmlGeneratorSuite extends AnyFunSuite with BeforeAndAfterAll {
timeDatum = "12:34:56",
timestampDatum = Timestamp.from(ZonedDateTime.of(2017, 12, 20, 21, 46, 54, 0,
ZoneId.of("UTC")).toInstant),
noTZTimestampDatum = Timestamp.valueOf("2017-12-20 21:46:54"),
nullDatum = null),
KnownData(booleanDatum = false,
dateDatum = Date.valueOf("2016-12-19"),
Expand All @@ -89,15 +91,19 @@ final class StaxXmlGeneratorSuite extends AnyFunSuite with BeforeAndAfterAll {
timeDatum = "23:45:16",
timestampDatum = Timestamp.from(ZonedDateTime.of(2017, 12, 29, 17, 21, 49, 0,
ZoneId.of("America/New_York")).toInstant),
noTZTimestampDatum = Timestamp.valueOf("2017-12-29 17:21:49"),
nullDatum = null)
)

val df = dataset.toDF().orderBy("booleanDatum")
val timestampFormat = "yyyy-MM-dd HH:mm:ss.SSSXXX"
val timezone = "UTC"
val targetFile =
Files.createTempDirectory("StaxXmlGeneratorSuite").resolve("roundtrip.xml").toString
df.write.format("xml").save(targetFile)
val newDf =
spark.read.schema(df.schema).format("xml").load(targetFile).orderBy("booleanDatum")
df.write.option("timestampFormat", timestampFormat).option("timezone", timezone).
format("xml").save(targetFile)
val newDf = spark.read.schema(df.schema).option("timestampFormat", timestampFormat).
option("timezone", timezone).format("xml").load(targetFile).orderBy("booleanDatum")
assert(df.collect().toSeq === newDf.collect().toSeq)
}

Expand Down

0 comments on commit 6969262

Please sign in to comment.