Use defined timezone on write for formats that need TZ info (#665)

* Use defined timezone on write for formats that need TZ info
* Add test
databricks · Oct 10, 2023 · 6969262 · 6969262
1 parent b2611bd
commit 6969262
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -68,6 +68,7 @@ Defaults to `false`. New in 0.11.0.
 columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
 By default, several formats are tried, including [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT)
 and variations with offset timezones or no timezone (defaults to UTC). New in 0.12.0. As of 0.16.0, if a custom format pattern is used without a timezone, the default Spark timezone specified by `spark.sql.session.timeZone` will be used.
+* `timezone`: identifier of timezone to be used when reading timestamps without a timezone specified. New in 0.16.0.
 * `dateFormat`: Specifies an additional date format that will be tried when parsing values as `DateType`
 columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
 Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0.
@@ -86,6 +87,7 @@ When writing files the API accepts several options:
 * `timestampFormat`: Controls the format used to write `TimestampType` columns.
 The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
 Defaults to [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT). New in 0.12.0. As of 0.16.0, if a custom format pattern is used without a timezone, the default Spark timezone specified by `spark.sql.session.timeZone` will be used.
+* `timezone`: identifier of timezone to be used when writing timestamps without a timezone specified. New in 0.16.0.
 * `dateFormat`: Controls the format used to write `DateType` columns.
 The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html).
 Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0.

diff --git a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlGenerator.scala b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlGenerator.scala
@@ -16,6 +16,7 @@
 package com.databricks.spark.xml.parsers
 
 import java.sql.{Date, Timestamp}
+import java.time.ZoneId
 import java.time.format.DateTimeFormatter
 
 import javax.xml.stream.XMLStreamWriter
@@ -85,7 +86,8 @@ private[xml] object StaxXmlGenerator {
       case (StringType, v: String) => writer.writeCharacters(v)
       case (TimestampType, v: Timestamp) =>
         val formatter = options.timestampFormat.map(DateTimeFormatter.ofPattern).
-          getOrElse(DateTimeFormatter.ISO_INSTANT)
+          getOrElse(DateTimeFormatter.ISO_INSTANT).
+          withZone(options.timezone.map(ZoneId.of).orNull)
         writer.writeCharacters(formatter.format(v.toInstant()))
       case (DateType, v: Date) =>
         val formatter = options.dateFormat.map(DateTimeFormatter.ofPattern).

diff --git a/src/test/scala/com/databricks/spark/xml/parsers/StaxXmlGeneratorSuite.scala b/src/test/scala/com/databricks/spark/xml/parsers/StaxXmlGeneratorSuite.scala
@@ -36,6 +36,7 @@ case class KnownData(
     stringDatum: String,
     timeDatum: String,
     timestampDatum: Timestamp,
+    noTZTimestampDatum: Timestamp,
     nullDatum: Null
 )
 
@@ -78,6 +79,7 @@ final class StaxXmlGeneratorSuite extends AnyFunSuite with BeforeAndAfterAll {
         timeDatum = "12:34:56",
         timestampDatum = Timestamp.from(ZonedDateTime.of(2017, 12, 20, 21, 46, 54, 0,
           ZoneId.of("UTC")).toInstant),
+        noTZTimestampDatum = Timestamp.valueOf("2017-12-20 21:46:54"),
         nullDatum = null),
       KnownData(booleanDatum = false,
         dateDatum = Date.valueOf("2016-12-19"),
@@ -89,15 +91,19 @@ final class StaxXmlGeneratorSuite extends AnyFunSuite with BeforeAndAfterAll {
         timeDatum = "23:45:16",
         timestampDatum = Timestamp.from(ZonedDateTime.of(2017, 12, 29, 17, 21, 49, 0,
           ZoneId.of("America/New_York")).toInstant),
+        noTZTimestampDatum = Timestamp.valueOf("2017-12-29 17:21:49"),
         nullDatum = null)
     )
 
     val df = dataset.toDF().orderBy("booleanDatum")
+    val timestampFormat = "yyyy-MM-dd HH:mm:ss.SSSXXX"
+    val timezone = "UTC"
     val targetFile =
       Files.createTempDirectory("StaxXmlGeneratorSuite").resolve("roundtrip.xml").toString
-    df.write.format("xml").save(targetFile)
-    val newDf =
-      spark.read.schema(df.schema).format("xml").load(targetFile).orderBy("booleanDatum")
+    df.write.option("timestampFormat", timestampFormat).option("timezone", timezone).
+      format("xml").save(targetFile)
+    val newDf = spark.read.schema(df.schema).option("timestampFormat", timestampFormat).
+      option("timezone", timezone).format("xml").load(targetFile).orderBy("booleanDatum")
     assert(df.collect().toSeq === newDf.collect().toSeq)
   }