[SPARK-18076][CORE][SQL] Fix default Locale used in DateFormat, NumberFormat to Locale.US

## What changes were proposed in this pull request?

Fix all usages of `DateFormat` and `NumberFormat` to use `Locale.US` explicitly, so that formatting and parsing do not depend on the platform's default locale.
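
For illustration (a hypothetical snippet, not part of the patch): any pattern that renders names, such as `MMM`, silently depends on the JVM's default locale unless a locale is passed explicitly.

```scala
import java.text.SimpleDateFormat
import java.util.{Date, Locale}

val d = new Date(0L)

// Locale-sensitive: the month renders as "janv." on a French-default JVM, "Jan" on a US one
new SimpleDateFormat("dd-MMM-yyyy").format(d)

// Pinned to Locale.US, as done throughout this patch: identical output on every platform
new SimpleDateFormat("dd-MMM-yyyy", Locale.US).format(d)
```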
## How was this patch tested?

Existing tests.

Author: Sean Owen <[email protected]>

Closes #15610 from srowen/SPARK-18076.
srowen committed Nov 2, 2016
1 parent 70a5db7 commit 9c8deef
Showing 31 changed files with 103 additions and 96 deletions.
8 changes: 4 additions & 4 deletions core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala
@@ -20,7 +20,7 @@ package org.apache.spark
import java.io.IOException
import java.text.NumberFormat
import java.text.SimpleDateFormat
-import java.util.Date
+import java.util.{Date, Locale}

import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
@@ -67,12 +67,12 @@ class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable {

def setup(jobid: Int, splitid: Int, attemptid: Int) {
setIDs(jobid, splitid, attemptid)
HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss").format(now),
HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(now),
jobid, splitID, attemptID, conf.value)
}

def open() {
-val numfmt = NumberFormat.getInstance()
+val numfmt = NumberFormat.getInstance(Locale.US)
numfmt.setMinimumIntegerDigits(5)
numfmt.setGroupingUsed(false)

@@ -162,7 +162,7 @@ class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable {
private[spark]
object SparkHadoopWriter {
def createJobID(time: Date, id: Int): JobID = {
val formatter = new SimpleDateFormat("yyyyMMddHHmmss")
val formatter = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
val jobtrackerID = formatter.format(time)
new JobID(jobtrackerID, id)
}
core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -21,7 +21,7 @@ import java.io.IOException
import java.lang.reflect.Method
import java.security.PrivilegedExceptionAction
import java.text.DateFormat
-import java.util.{Arrays, Comparator, Date}
+import java.util.{Arrays, Comparator, Date, Locale}

import scala.collection.JavaConverters._
import scala.util.control.NonFatal
@@ -357,7 +357,7 @@ class SparkHadoopUtil extends Logging {
* @return a printable string value.
*/
private[spark] def tokenToString(token: Token[_ <: TokenIdentifier]): String = {
-val df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT)
+val df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT, Locale.US)
val buffer = new StringBuilder(128)
buffer.append(token.toString)
try {
core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -18,7 +18,7 @@
package org.apache.spark.deploy.master

import java.text.SimpleDateFormat
-import java.util.Date
+import java.util.{Date, Locale}
import java.util.concurrent.{ScheduledFuture, TimeUnit}

import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
@@ -51,7 +51,8 @@ private[deploy] class Master(

private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)

private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs
// For application IDs
private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)

private val WORKER_TIMEOUT_MS = conf.getLong("spark.worker.timeout", 60) * 1000
private val RETAINED_APPLICATIONS = conf.getInt("spark.deploy.retainedApplications", 200)
core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
@@ -20,7 +20,7 @@ package org.apache.spark.deploy.worker
import java.io.File
import java.io.IOException
import java.text.SimpleDateFormat
-import java.util.{Date, UUID}
+import java.util.{Date, Locale, UUID}
import java.util.concurrent._
import java.util.concurrent.{Future => JFuture, ScheduledFuture => JScheduledFuture}

@@ -68,7 +68,7 @@ private[deploy] class Worker(
ThreadUtils.newDaemonSingleThreadExecutor("worker-cleanup-thread"))

// For worker and executor IDs
private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss")
private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
// Send a heartbeat every (heartbeat timeout) / 4 milliseconds
private val HEARTBEAT_MILLIS = conf.getLong("spark.worker.timeout", 60) * 1000 / 4

5 changes: 3 additions & 2 deletions core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -19,7 +19,7 @@ package org.apache.spark.rdd

import java.io.IOException
import java.text.SimpleDateFormat
-import java.util.Date
+import java.util.{Date, Locale}

import scala.collection.immutable.Map
import scala.reflect.ClassTag
@@ -243,7 +243,8 @@ class HadoopRDD[K, V](

var reader: RecordReader[K, V] = null
val inputFormat = getInputFormat(jobConf)
HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss").format(createTime),
HadoopRDD.addLocalConfiguration(
new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(createTime),
context.stageId, theSplit.index, context.attemptNumber, jobConf)
reader = inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL)

4 changes: 2 additions & 2 deletions core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -19,7 +19,7 @@ package org.apache.spark.rdd

import java.io.IOException
import java.text.SimpleDateFormat
-import java.util.Date
+import java.util.{Date, Locale}

import scala.reflect.ClassTag

@@ -79,7 +79,7 @@ class NewHadoopRDD[K, V](
// private val serializableConf = new SerializableWritable(_conf)

private val jobTrackerId: String = {
val formatter = new SimpleDateFormat("yyyyMMddHHmmss")
val formatter = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
formatter.format(new Date())
}

core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -19,7 +19,7 @@ package org.apache.spark.rdd

import java.nio.ByteBuffer
import java.text.SimpleDateFormat
-import java.util.{Date, HashMap => JHashMap}
+import java.util.{Date, HashMap => JHashMap, Locale}

import scala.collection.{mutable, Map}
import scala.collection.JavaConverters._
@@ -1079,7 +1079,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
// Rename this as hadoopConf internally to avoid shadowing (see SPARK-2038).
val hadoopConf = conf
val job = NewAPIHadoopJob.getInstance(hadoopConf)
val formatter = new SimpleDateFormat("yyyyMMddHHmmss")
val formatter = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
val jobtrackerID = formatter.format(new Date())
val stageId = self.id
val jobConfiguration = job.getConfiguration
core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala
@@ -21,7 +21,7 @@ import java.lang.annotation.Annotation
import java.lang.reflect.Type
import java.nio.charset.StandardCharsets
import java.text.SimpleDateFormat
-import java.util.{Calendar, SimpleTimeZone}
+import java.util.{Calendar, Locale, SimpleTimeZone}
import javax.ws.rs.Produces
import javax.ws.rs.core.{MediaType, MultivaluedMap}
import javax.ws.rs.ext.{MessageBodyWriter, Provider}
@@ -86,7 +86,7 @@ private[v1] class JacksonMessageWriter extends MessageBodyWriter[Object]{

private[spark] object JacksonMessageWriter {
def makeISODateFormat: SimpleDateFormat = {
val iso8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'GMT'")
val iso8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'GMT'", Locale.US)
val cal = Calendar.getInstance(new SimpleTimeZone(0, "GMT"))
iso8601.setCalendar(cal)
iso8601
core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala
@@ -17,20 +17,20 @@
package org.apache.spark.status.api.v1

import java.text.{ParseException, SimpleDateFormat}
-import java.util.TimeZone
+import java.util.{Locale, TimeZone}
import javax.ws.rs.WebApplicationException
import javax.ws.rs.core.Response
import javax.ws.rs.core.Response.Status

private[v1] class SimpleDateParam(val originalValue: String) {

val timestamp: Long = {
val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz")
val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz", Locale.US)
try {
format.parse(originalValue).getTime()
} catch {
case _: ParseException =>
val gmtDay = new SimpleDateFormat("yyyy-MM-dd")
val gmtDay = new SimpleDateFormat("yyyy-MM-dd", Locale.US)
gmtDay.setTimeZone(TimeZone.getTimeZone("GMT"))
try {
gmtDay.parse(originalValue).getTime()
3 changes: 2 additions & 1 deletion core/src/main/scala/org/apache/spark/ui/UIUtils.scala
@@ -36,7 +36,8 @@ private[spark] object UIUtils extends Logging {

// SimpleDateFormat is not thread-safe. Don't expose it to avoid improper use.
private val dateFormat = new ThreadLocal[SimpleDateFormat]() {
override def initialValue(): SimpleDateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss")
override def initialValue(): SimpleDateFormat =
new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", Locale.US)
}

def formatDate(date: Date): String = dateFormat.get.format(date)
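
(Aside, not part of the diff: the `ThreadLocal` idiom in `UIUtils` above is the standard way to reuse a non-thread-safe `SimpleDateFormat`; a minimal standalone sketch, with a hypothetical object name:)

```scala
import java.text.SimpleDateFormat
import java.util.{Date, Locale}

object SafeDateFormatting {
  // SimpleDateFormat is not thread-safe, so each thread lazily creates
  // and then reuses its own private instance.
  private val fmt = new ThreadLocal[SimpleDateFormat] {
    override def initialValue(): SimpleDateFormat =
      new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", Locale.US)
  }

  def formatDate(date: Date): String = fmt.get.format(date)
}
```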
core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala
@@ -18,7 +18,7 @@
package org.apache.spark.util.logging

import java.text.SimpleDateFormat
-import java.util.Calendar
+import java.util.{Calendar, Locale}

import org.apache.spark.internal.Logging

@@ -59,7 +59,7 @@ private[spark] class TimeBasedRollingPolicy(
}

@volatile private var nextRolloverTime = calculateNextRolloverTime()
-private val formatter = new SimpleDateFormat(rollingFileSuffixPattern)
+private val formatter = new SimpleDateFormat(rollingFileSuffixPattern, Locale.US)

/** Should rollover if current time has exceeded next rollover time */
def shouldRollover(bytesToBeWritten: Long): Boolean = {
@@ -109,7 +109,7 @@ private[spark] class SizeBasedRollingPolicy(
}

@volatile private var bytesWrittenSinceRollover = 0L
val formatter = new SimpleDateFormat("--yyyy-MM-dd--HH-mm-ss--SSSS")
val formatter = new SimpleDateFormat("--yyyy-MM-dd--HH-mm-ss--SSSS", Locale.US)

/** Should rollover if the next set of bytes is going to exceed the size limit */
def shouldRollover(bytesToBeWritten: Long): Boolean = {
2 changes: 1 addition & 1 deletion core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -264,7 +264,7 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
val hour = minute * 60
def str: (Long) => String = Utils.msDurationToString(_)

-val sep = new DecimalFormatSymbols(Locale.getDefault()).getDecimalSeparator()
+val sep = new DecimalFormatSymbols(Locale.US).getDecimalSeparator

assert(str(123) === "123 ms")
assert(str(second) === "1" + sep + "0 s")
mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosSubmitRequestServlet.scala
@@ -19,7 +19,7 @@ package org.apache.spark.deploy.rest.mesos

import java.io.File
import java.text.SimpleDateFormat
-import java.util.Date
+import java.util.{Date, Locale}
import java.util.concurrent.atomic.AtomicLong
import javax.servlet.http.HttpServletResponse

@@ -62,11 +62,10 @@ private[mesos] class MesosSubmitRequestServlet(
private val DEFAULT_CORES = 1.0

private val nextDriverNumber = new AtomicLong(0)
private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs
private def newDriverId(submitDate: Date): String = {
"driver-%s-%04d".format(
createDateFormat.format(submitDate), nextDriverNumber.incrementAndGet())
}
// For application IDs
private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
private def newDriverId(submitDate: Date): String =
f"driver-${createDateFormat.format(submitDate)}-${nextDriverNumber.incrementAndGet()}%04d"

/**
* Build a driver description from the fields specified in the submit request.
mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala
@@ -18,7 +18,7 @@
package org.apache.spark.mllib.pmml.export

import java.text.SimpleDateFormat
-import java.util.Date
+import java.util.{Date, Locale}

import scala.beans.BeanProperty

@@ -34,7 +34,7 @@ private[mllib] trait PMMLModelExport {
val version = getClass.getPackage.getImplementationVersion
val app = new Application("Apache Spark MLlib").setVersion(version)
val timestamp = new Timestamp()
.addContent(new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(new Date()))
.addContent(new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.US).format(new Date()))
val header = new Header()
.setApplication(app)
.setTimestamp(timestamp)
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
@@ -18,7 +18,7 @@
package org.apache.spark.sql.catalyst.expressions

import java.text.SimpleDateFormat
-import java.util.{Calendar, TimeZone}
+import java.util.{Calendar, Locale, TimeZone}

import scala.util.Try

@@ -331,7 +331,7 @@ case class DateFormatClass(left: Expression, right: Expression) extends BinaryEx
override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, StringType)

override protected def nullSafeEval(timestamp: Any, format: Any): Any = {
-val sdf = new SimpleDateFormat(format.toString)
+val sdf = new SimpleDateFormat(format.toString, Locale.US)
UTF8String.fromString(sdf.format(new java.util.Date(timestamp.asInstanceOf[Long] / 1000)))
}

@@ -400,7 +400,7 @@ abstract class UnixTime extends BinaryExpression with ExpectsInputTypes {

private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String]
private lazy val formatter: SimpleDateFormat =
-Try(new SimpleDateFormat(constFormat.toString)).getOrElse(null)
+Try(new SimpleDateFormat(constFormat.toString, Locale.US)).getOrElse(null)

override def eval(input: InternalRow): Any = {
val t = left.eval(input)
@@ -425,7 +425,7 @@ abstract class UnixTime extends BinaryExpression with ExpectsInputTypes {
null
} else {
val formatString = f.asInstanceOf[UTF8String].toString
-Try(new SimpleDateFormat(formatString).parse(
+Try(new SimpleDateFormat(formatString, Locale.US).parse(
t.asInstanceOf[UTF8String].toString).getTime / 1000L).getOrElse(null)
}
}
@@ -520,7 +520,7 @@ case class FromUnixTime(sec: Expression, format: Expression)

private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String]
private lazy val formatter: SimpleDateFormat =
-Try(new SimpleDateFormat(constFormat.toString)).getOrElse(null)
+Try(new SimpleDateFormat(constFormat.toString, Locale.US)).getOrElse(null)

override def eval(input: InternalRow): Any = {
val time = left.eval(input)
@@ -539,9 +539,10 @@ case class FromUnixTime(sec: Expression, format: Expression)
if (f == null) {
null
} else {
-Try(UTF8String.fromString(new SimpleDateFormat(
-f.asInstanceOf[UTF8String].toString).format(new java.util.Date(
-time.asInstanceOf[Long] * 1000L)))).getOrElse(null)
+Try(
+UTF8String.fromString(new SimpleDateFormat(f.toString, Locale.US).
+format(new java.util.Date(time.asInstanceOf[Long] * 1000L)))
+).getOrElse(null)
}
}
}
@@ -1415,7 +1415,7 @@ case class Sentences(
val locale = if (languageStr != null && countryStr != null) {
new Locale(languageStr.toString, countryStr.toString)
} else {
-Locale.getDefault
+Locale.US
}
getSentences(string.asInstanceOf[UTF8String].toString, locale)
}
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
@@ -17,6 +17,8 @@

package org.apache.spark.sql.catalyst.json

+import java.util.Locale

import com.fasterxml.jackson.core.{JsonFactory, JsonParser}
import org.apache.commons.lang3.time.FastDateFormat

@@ -56,11 +58,11 @@ private[sql] class JSONOptions(

// Uses `FastDateFormat` which can be direct replacement for `SimpleDateFormat` and thread-safe.
val dateFormat: FastDateFormat =
FastDateFormat.getInstance(parameters.getOrElse("dateFormat", "yyyy-MM-dd"))
FastDateFormat.getInstance(parameters.getOrElse("dateFormat", "yyyy-MM-dd"), Locale.US)

val timestampFormat: FastDateFormat =
FastDateFormat.getInstance(
parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSZZ"))
parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSZZ"), Locale.US)

// Parse mode flags
if (!ParseModes.isValidMode(parseMode)) {
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util

import java.sql.{Date, Timestamp}
import java.text.{DateFormat, SimpleDateFormat}
-import java.util.{Calendar, TimeZone}
+import java.util.{Calendar, Locale, TimeZone}
import javax.xml.bind.DatatypeConverter

import scala.annotation.tailrec
@@ -79,14 +79,14 @@ object DateTimeUtils {
// `SimpleDateFormat` is not thread-safe.
val threadLocalTimestampFormat = new ThreadLocal[DateFormat] {
override def initialValue(): SimpleDateFormat = {
new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US)
}
}

// `SimpleDateFormat` is not thread-safe.
private val threadLocalDateFormat = new ThreadLocal[DateFormat] {
override def initialValue(): SimpleDateFormat = {
new SimpleDateFormat("yyyy-MM-dd")
new SimpleDateFormat("yyyy-MM-dd", Locale.US)
}
}


6 comments on commit 9c8deef

@ebuildy (Contributor)

Hello, why is US hardcoded?

@srowen (Member, Author) commented on 9c8deef, Jan 15, 2018

@ebuildy (Ask on the mailing list, please.) These aren't uses that are sensitive to any user locale, and indeed they can't be allowed to vary with the system locale, or else things break.
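
For a concrete example of what can break (an illustrative snippet, not from the patch): `NumberFormat` output changes with the default locale, and in `SparkHadoopWriter` that output becomes part of output file names such as `part-00007`.

```scala
import java.text.NumberFormat
import java.util.Locale

// The same value renders differently under different locales:
NumberFormat.getInstance(Locale.US).format(1234567.89)      // "1,234,567.89"
NumberFormat.getInstance(Locale.GERMANY).format(1234567.89) // "1.234.567,89"

// Pinning Locale.US, as this commit does, keeps derived artifacts stable:
val numfmt = NumberFormat.getInstance(Locale.US)
numfmt.setMinimumIntegerDigits(5)
numfmt.setGroupingUsed(false)
numfmt.format(7) // "00007" on every platform
```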

@ebuildy (Contributor)

Actually, there is a big one: the week of the year is not the same between Spark's `weekofyear` function and `date_format("w")`. I will ask on the mailing list, thank you.
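
The discrepancy is reproducible with plain JDK classes (a sketch; it assumes `weekofyear` follows ISO-style week rules, which is what Spark's implementation configures on its `Calendar`):

```scala
import java.text.SimpleDateFormat
import java.util.{Calendar, Locale}

val day = new SimpleDateFormat("yyyy-MM-dd", Locale.US).parse("2016-01-01")

// date_format(..., "w") now uses Locale.US rules:
// weeks start on Sunday and week 1 needs only 1 day, so this prints "1"
println(new SimpleDateFormat("w", Locale.US).format(day))

// weekofyear uses ISO-style rules: weeks start on Monday and week 1
// needs 4 days, so 2016-01-01 (a Friday) falls in week 53 of 2015
val cal = Calendar.getInstance(Locale.US)
cal.setFirstDayOfWeek(Calendar.MONDAY)
cal.setMinimalDaysInFirstWeek(4)
cal.setTime(day)
println(cal.get(Calendar.WEEK_OF_YEAR)) // 53
```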

@srowen (Member, Author) commented on 9c8deef, Jan 16, 2018
The idea is that all of those functions should at least be deterministic w.r.t. the platform's locale. Those that don't take a locale as an argument would default to the US locale to generally match Hive and other DB semantics. Where you specify a locale, you may get different results.

@ebuildy (Contributor)

Yes, it's a good idea, especially with cloud computing, where Spark can run everywhere ^^

So, is it still possible to specify a locale for the `date_format` SQL function?

@srowen (Member, Author) commented on 9c8deef, Jan 16, 2018
You can specify a format, not a locale. That function is there to mimic Hive. Although it's not out of the question to overload it further, I think you'd be advised to write a UDF if you wanted more control.
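
For instance, a locale-aware variant is only a few lines as a UDF (a hypothetical sketch; the function name and signature are illustrative, not an existing Spark API):

```scala
import java.text.SimpleDateFormat
import java.util.Locale

import org.apache.spark.sql.functions.{lit, udf}

// Format a timestamp with a caller-supplied pattern and BCP 47 language tag
val dateFormatLocale = udf { (ts: java.sql.Timestamp, fmt: String, lang: String) =>
  new SimpleDateFormat(fmt, Locale.forLanguageTag(lang)).format(ts)
}

// e.g. df.select(dateFormatLocale($"ts", lit("dd MMMM yyyy"), lit("fr-FR")))
```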
