Skip to content

Commit

Permalink
[SPARK-13638][SQL] Add quoteAll option to CSV DataFrameWriter
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

Adds an quoteAll option for writing CSV which will quote all fields.
See https://issues.apache.org/jira/browse/SPARK-13638

## How was this patch tested?

Added a test to verify the output columns are quoted for all fields in the Dataframe

Author: Jurriaan Pruis <[email protected]>

Closes apache#13374 from jurriaan/csv-quote-all.
  • Loading branch information
jurriaan authored and rxin committed Jul 8, 2016
1 parent 255d74f commit 38cf8f2
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 3 deletions.
7 changes: 5 additions & 2 deletions python/pyspark/sql/readwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,7 @@ def text(self, path, compression=None):

@since(2.0)
def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None,
header=None, nullValue=None, escapeQuotes=None):
header=None, nullValue=None, escapeQuotes=None, quoteAll=None):
"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
:param path: the path in any Hadoop supported file system
Expand All @@ -658,6 +658,9 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No
:param escapeQuotes: A flag indicating whether values containing quotes should always
be enclosed in quotes. If None is set, it uses the default value
``true``, escaping all values containing a quote character.
:param quoteAll: A flag indicating whether all values should always be enclosed in
quotes. If None is set, it uses the default value ``false``,
only escaping values containing a quote character.
:param header: writes the names of columns as the first line. If None is set, it uses
the default value, ``false``.
:param nullValue: sets the string representation of a null value. If None is set, it uses
Expand All @@ -667,7 +670,7 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No
"""
self.mode(mode)
self._set_opts(compression=compression, sep=sep, quote=quote, escape=escape, header=header,
nullValue=nullValue, escapeQuotes=escapeQuotes)
nullValue=nullValue, escapeQuotes=escapeQuotes, quoteAll=quoteAll)
self._jwrite.csv(path)

@since(1.5)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
* <li>`escapeQuotes` (default `true`): a flag indicating whether values containing
* quotes should always be enclosed in quotes. Default is to escape all values containing
* a quote character.</li>
* <li>`quoteAll` (default `false`): A flag indicating whether all values should always be
* enclosed in quotes. Default is to only escape values containing a quote character.</li>
* <li>`header` (default `false`): writes the names of columns as the first line.</li>
* <li>`nullValue` (default empty string): sets the string representation of a null value.</li>
* <li>`compression` (default `null`): compression codec to use when saving to file. This can be
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str

val maxMalformedLogPerPartition = getInt("maxMalformedLogPerPartition", 10)

val quoteAll = getBool("quoteAll", false)

val inputBufferSize = 128

val isCommentSet = this.comment != '\u0000'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten
writerSettings.setNullValue(params.nullValue)
writerSettings.setEmptyValue(params.nullValue)
writerSettings.setSkipEmptyLines(true)
writerSettings.setQuoteAllFields(false)
writerSettings.setQuoteAllFields(params.quoteAll)
writerSettings.setHeaders(headers: _*)
writerSettings.setQuoteEscapingEnabled(params.escapeQuotes)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,32 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
}
}

test("save csv with quoteAll enabled") {
withTempDir { dir =>
val csvDir = new File(dir, "csv").getCanonicalPath

val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well"))
val df = spark.createDataFrame(data)

// escapeQuotes should be true by default
df.coalesce(1).write
.format("csv")
.option("quote", "\"")
.option("escape", "\"")
.option("quoteAll", "true")
.save(csvDir)

val results = spark.read
.format("text")
.load(csvDir)
.collect()

val expected = "\"test \"\"quote\"\"\",\"123\",\"it \"\"works\"\"!\",\"\"\"very\"\" well\""

assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected)))
}
}

test("save csv with quote escaping enabled") {
withTempDir { dir =>
val csvDir = new File(dir, "csv").getCanonicalPath
Expand Down

0 comments on commit 38cf8f2

Please sign in to comment.